diff --git a/projects/hip-tests/catch/unit/atomics/CMakeLists.txt b/projects/hip-tests/catch/unit/atomics/CMakeLists.txt new file mode 100644 index 0000000000..fc30955d5a --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +set(TEST_SRC + atomicCAS.cc + atomicCAS_system.cc +) + +if(HIP_PLATFORM MATCHES "nvidia") + set_source_files_properties(atomicCAS_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + hip_add_exe_to_target(NAME AtomicsTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests + LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") +elseif(HIP_PLATFORM MATCHES "amd") + hip_add_exe_to_target(NAME AtomicsTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests + LINKER_LIBS hiprtc) +endif() + +add_test(NAME Unit_atomicCAS_Negative_Parameters + COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py + ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH} + atomicCAS_negative_kernels.cc 48) diff --git a/projects/hip-tests/catch/unit/atomics/arithmetic_common.hh b/projects/hip-tests/catch/unit/atomics/arithmetic_common.hh new file mode 100644 index 0000000000..384665b502 --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/arithmetic_common.hh @@ -0,0 +1,449 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include +#include +#include + +namespace cg = cooperative_groups; + +enum class AtomicOperation { + kAdd = 0, + kAddSystem, + kSub, + kSubSystem, + kInc, + kDec, + kUnsafeAdd, + kSafeAdd, + kCASAdd, + kCASAddSystem +}; + +constexpr auto kIntegerTestValue = 7; +constexpr auto kFloatingPointTestValue = 3.125; +constexpr auto kIncDecWraparoundValue = 1023; + +template +__host__ __device__ TestType GetTestValue() { + if constexpr (operation == AtomicOperation::kInc || operation == AtomicOperation::kDec) { + return kIncDecWraparoundValue; + } + + return std::is_floating_point_v ? kFloatingPointTestValue : kIntegerTestValue; +} + +template __device__ TestType CASAtomicAdd(TestType* address, TestType val) { + TestType old = *address, assumed; + + do { + assumed = old; + old = atomicCAS(address, assumed, val + assumed); + } while (assumed != old); + + return old; +} + +template +__device__ TestType CASAtomicAddSystem(TestType* address, TestType val) { + TestType old = *address, assumed; + + do { + assumed = old; + old = atomicCAS_system(address, assumed, val + assumed); + } while (assumed != old); + + return old; +} + +template +__device__ TestType PerformAtomicOperation(TestType* const mem) { + const auto val = GetTestValue(); + + if constexpr (operation == AtomicOperation::kAdd) { + return atomicAdd(mem, val); + } else if constexpr (operation == AtomicOperation::kAddSystem) { + return atomicAdd_system(mem, val); + } else if constexpr (operation == AtomicOperation::kSub) { + return atomicSub(mem, val); + } else if constexpr (operation == AtomicOperation::kSubSystem) { + return atomicSub_system(mem, val); + } else if constexpr (operation == AtomicOperation::kInc) { + return atomicInc(mem, val); + } else if constexpr (operation == AtomicOperation::kDec) { + return atomicDec(mem, val); + } else if constexpr (operation == AtomicOperation::kUnsafeAdd) { + return unsafeAtomicAdd(mem, val); + } else if constexpr (operation == AtomicOperation::kSafeAdd) { + return safeAtomicAdd(mem, val); + } else if constexpr (operation == AtomicOperation::kCASAdd) { + return CASAtomicAdd(mem, val); + } else if constexpr (operation == AtomicOperation::kCASAddSystem) { + return CASAtomicAddSystem(mem, val); + } +} + +template +__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) { + __shared__ TestType shared_mem; + + const auto tid = cg::this_grid().thread_rank(); + + TestType* const mem = use_shared_mem ? &shared_mem : global_mem; + + if constexpr (use_shared_mem) { + if (tid == 0) mem[0] = global_mem[0]; + __syncthreads(); + } + + old_vals[tid] = PerformAtomicOperation(mem); + + if constexpr (use_shared_mem) { + __syncthreads(); + if (tid == 0) global_mem[0] = mem[0]; + } +} + +template +__host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned int pitch, + const unsigned int idx) { + const auto byte_ptr = reinterpret_cast(ptr); + return reinterpret_cast(byte_ptr + idx * pitch); +} + +template +__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals, + const unsigned int width, const unsigned pitch) { + extern __shared__ uint8_t shared_mem[]; + + const auto tid = cg::this_grid().thread_rank(); + + TestType* const mem = use_shared_mem ? reinterpret_cast(shared_mem) : global_mem; + + if constexpr (use_shared_mem) { + if (tid < width) { + const auto target = PitchedOffset(mem, pitch, tid); + *target = *PitchedOffset(global_mem, pitch, tid); + }; + __syncthreads(); + } + + old_vals[tid] = + PerformAtomicOperation(PitchedOffset(mem, pitch, tid % width)); + + if constexpr (use_shared_mem) { + __syncthreads(); + if (tid < width) { + const auto target = PitchedOffset(global_mem, pitch, tid); + *target = *PitchedOffset(mem, pitch, tid); + }; + } +} + +struct TestParams { + auto ThreadCount() const { + return blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z; + } + + auto HostIterationsPerThread() const { + return std::max(num_devices * kernel_count * ThreadCount() / 20, width); + } + + dim3 blocks; + dim3 threads; + unsigned int num_devices = 1u; + unsigned int kernel_count = 1u; + unsigned int width = 1u; + unsigned int pitch = 0u; + unsigned int host_thread_count = 0u; + LinearAllocs alloc_type; +}; + +template +std::tuple, std::vector> TestKernelHostRef(const TestParams& p) { + const auto val = GetTestValue(); + + const auto total_thread_count = p.num_devices * p.kernel_count * p.ThreadCount() + + p.host_thread_count * p.HostIterationsPerThread(); + + std::vector res_vals(p.width); + std::vector old_vals; + old_vals.reserve(total_thread_count); + + auto perform_op = [&](unsigned id) { + auto& res = res_vals[id % p.width]; + old_vals.push_back(res); + + if constexpr (operation == AtomicOperation::kAdd || operation == AtomicOperation::kAddSystem || + operation == AtomicOperation::kUnsafeAdd || + operation == AtomicOperation::kSafeAdd || operation == AtomicOperation::kCASAdd || + operation == AtomicOperation::kCASAddSystem) { + res = res + val; + } else if constexpr (operation == AtomicOperation::kSub || + operation == AtomicOperation::kSubSystem) { + res = res - val; + } else if constexpr (operation == AtomicOperation::kInc) { + res = (res >= val) ? 0 : res + 1; + } else if constexpr (operation == AtomicOperation::kDec) { + res = ((res == 0) || (res > val)) ? val : res - 1; + } + }; + + for (auto i = 0u; i < p.num_devices; ++i) { + for (auto j = 0u; j < p.kernel_count; ++j) { + for (auto tid = 0u; tid < p.ThreadCount(); ++tid) { + perform_op(tid); + } + } + } + + for (auto i = 0u; i < p.host_thread_count; ++i) { + for (auto j = 0u; j < p.HostIterationsPerThread(); ++j) { + perform_op(j); + } + } + + return {res_vals, old_vals}; +} + +template +void Verify(const TestParams& p, std::vector& res_vals, std::vector& old_vals) { + auto [expected_res_vals, expected_old_vals] = TestKernelHostRef(p); + + for (auto i = 0u; i < res_vals.size(); ++i) { + INFO("Results index: " << i); + REQUIRE(expected_res_vals[i] == res_vals[i]); + } + + std::sort(begin(old_vals), end(old_vals)); + std::sort(begin(expected_old_vals), end(expected_old_vals)); + for (auto i = 0u; i < old_vals.size(); ++i) { + INFO("Old values index: " << i); + REQUIRE(expected_old_vals[i] == old_vals[i]); + } +} + +template +void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr, + TestType* const old_vals) { + const auto shared_mem_size = use_shared_mem ? p.width * p.pitch : 0u; + if (p.width == 1 && p.pitch == sizeof(TestType)) + TestKernel + <<>>(mem_ptr, old_vals); + else + TestKernel + <<>>(mem_ptr, old_vals, p.width, p.pitch); +} + +template +void HostAtomicOperation(const unsigned int iterations, TestType* mem, TestType* const old_vals, + const unsigned int width, const unsigned pitch, TestType base_val) { + const auto val = GetTestValue(); + + for (auto i = 0u; i < iterations; ++i) { + if constexpr (operation == AtomicOperation::kAddSystem || + operation == AtomicOperation::kCASAddSystem) { + old_vals[i] = __atomic_fetch_add(PitchedOffset(mem, pitch, i % width), val, __ATOMIC_RELAXED); + } else if constexpr (operation == AtomicOperation::kSubSystem) { + old_vals[i] = __atomic_fetch_sub(PitchedOffset(mem, pitch, i % width), val, __ATOMIC_RELAXED); + } + } +} + +template +void PerformHostAtomicOperation(const TestParams& p, TestType* mem, TestType* const old_vals) { + if (p.host_thread_count == 0) { + return; + } + + const auto host_base_val = p.num_devices * p.kernel_count * p.ThreadCount(); + + std::vector threads; + for (auto i = 0u; i < p.host_thread_count; ++i) { + const auto iterations = p.HostIterationsPerThread(); + const auto thread_base_val = host_base_val + i * iterations; + threads.push_back(std::thread(HostAtomicOperation, iterations, mem, + old_vals + thread_base_val, p.width, p.pitch, thread_base_val)); + } + + for (auto& th : threads) { + th.join(); + } +} + +template +void TestCore(const TestParams& p) { + const unsigned int flags = + p.alloc_type == LinearAllocs::mallocAndRegister ? hipHostRegisterMapped : 0u; + + const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType); + std::vector> old_vals_devs; + std::vector streams; + for (auto i = 0; i < p.num_devices; ++i) { + HIP_CHECK(hipSetDevice(i)); + old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size); + for (auto j = 0; j < p.kernel_count; ++j) { + streams.emplace_back(Streams::created); + } + } + + const auto mem_alloc_size = p.width * p.pitch; + LinearAllocGuard mem_dev(p.alloc_type, mem_alloc_size, flags); + + std::vector old_vals(p.num_devices * p.kernel_count * p.ThreadCount() + + p.host_thread_count * p.HostIterationsPerThread()); + std::vector res_vals(p.width); + + TestType* const mem_ptr = + p.alloc_type == LinearAllocs::hipMalloc ? mem_dev.ptr() : mem_dev.host_ptr(); + + HIP_CHECK(hipMemset(mem_ptr, 0, mem_alloc_size)); + + for (auto i = 0u; i < p.num_devices; ++i) { + for (auto j = 0u; j < p.kernel_count; ++j) { + const auto& stream = streams[i * p.kernel_count + j].stream(); + const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount(); + LaunchKernel(p, stream, mem_dev.ptr(), old_vals); + } + } + + PerformHostAtomicOperation(p, mem_dev.host_ptr(), old_vals.data()); + + for (auto i = 0u; i < p.num_devices; ++i) { + const auto device_offset = i * p.kernel_count * p.ThreadCount(); + HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(), + old_vals_alloc_size, hipMemcpyDeviceToHost)); + } + HIP_CHECK(hipMemcpy2D(res_vals.data(), sizeof(TestType), mem_ptr, p.pitch, sizeof(TestType), + p.width, hipMemcpyDeviceToHost)); + + Verify(p, res_vals, old_vals); +} + +inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); } + +inline dim3 GenerateBlockDimensions() { + int sm_count = 0; + HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0)); + return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2)); +} + +template +void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) { + TestParams params; + params.num_devices = 1; + params.kernel_count = 1; + params.threads = GenerateThreadDimensions(); + params.width = width; + params.pitch = pitch; + + SECTION("Global memory") { + params.blocks = GenerateBlockDimensions(); + using LA = LinearAllocs; + for (const auto alloc_type : + {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { + params.alloc_type = alloc_type; + DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) { + TestCore(params); + } + } + } + + SECTION("Shared memory") { + params.blocks = dim3(1); + params.alloc_type = LinearAllocs::hipMalloc; + TestCore(params); + } +} + +template +void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsigned int width, + const unsigned int pitch) { + int concurrent_kernels = 0; + HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0)); + if (!concurrent_kernels) { + HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution"); + return; + } + + TestParams params; + params.num_devices = 1; + params.kernel_count = kernel_count; + params.blocks = GenerateBlockDimensions(); + params.threads = GenerateThreadDimensions(); + params.width = width; + params.pitch = pitch; + + using LA = LinearAllocs; + for (const auto alloc_type : + {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { + params.alloc_type = alloc_type; + DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) { + TestCore(params); + } + } +} + +template +void MultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices, + const unsigned int kernel_count, + const unsigned int width, const unsigned int pitch, + const unsigned int host_thread_count = 0u) { + if (num_devices > 1) { + if (HipTest::getDeviceCount() < num_devices) { + std::string msg = std::to_string(num_devices) + " devices are required"; + HipTest::HIP_SKIP_TEST(msg.c_str()); + return; + } + } + + if (kernel_count > 1) { + for (auto i = 0u; i < num_devices; ++i) { + int concurrent_kernels = 0; + HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i)); + if (!concurrent_kernels) { + HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution"); + return; + } + } + } + + TestParams params; + params.num_devices = num_devices; + params.kernel_count = kernel_count; + params.blocks = GenerateBlockDimensions(); + params.threads = GenerateThreadDimensions(); + params.width = width; + params.pitch = pitch; + params.host_thread_count = host_thread_count; + + using LA = LinearAllocs; + for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { + params.alloc_type = alloc_type; + DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) { + TestCore(params); + } + } +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/atomics/atomicCAS.cc b/projects/hip-tests/catch/unit/atomics/atomicCAS.cc new file mode 100644 index 0000000000..3be684306d --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicCAS.cc @@ -0,0 +1,172 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "arithmetic_common.hh" +#include "atomicCAS_negative_kernels_rtc.hh" + +#include + +/** + * @addtogroup atomicCAS atomicCAS + * @{ + * @ingroup AtomicsTest + */ + +#ifdef HT_NVIDIA +#define TYPES +#else +#define TYPES , float, double +#endif + +/** + * Test Description + * ------------------------ + * - Executes a single kernel on a single device wherein all threads will perform an atomic + * addition, implemented using an atomic CAS operation, on a target memory location. Each thread + * will add the same value to the memory location, storing the return value into a separate output + * array slot corresponding to it. Once complete, the output array and target memory is validated to + * contain all the expected values. Several memory access patterns are tested: + * -# All threads exchange to a single, compile time deducible, memory location + * -# Each thread targets an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicCAS + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory + * - Shared memory + * - Several grid and block dimension combinations (only one block is used for shared memory). + * Test source + * ------------------------ + * - unit/atomics/atomicCAS.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive", "", int, unsigned int, unsigned long long TYPES) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + SingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + SingleDeviceSingleKernelTest(warp_size, sizeof(TestType)); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + SingleDeviceSingleKernelTest(warp_size, cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - Executes a kernel two times concurrently on a single device wherein all threads will perform + * an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each + * thread will add the same value to the memory location, storing the return value into a separate + * output array slot corresponding to it. Once complete, the output array and target memory is + * validated to contain all the expected values. Several memory access patterns are tested: + * -# All threads exchange to a single, compile time deducible, memory location + * -# Each thread targets an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicCAS + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory + * - Several grid and block dimension combinations. + * Test source + * ------------------------ + * - unit/atomics/atomicCAS.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive_Multi_Kernel", "", int, unsigned int, + unsigned long long TYPES) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + SingleDeviceMultipleKernelTest(2, 1, sizeof(TestType)); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + SingleDeviceMultipleKernelTest(2, warp_size, + sizeof(TestType)); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + SingleDeviceMultipleKernelTest(2, warp_size, + cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - RTCs kernels that pass combinations of arguments of invalid types for all overloads of + * atomicCAS. + * Test source + * ------------------------ + * - unit/atomics/atomicCAS.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_atomicCAS_Negative_Parameters_RTC") { + hiprtcProgram program{}; + + const auto program_source = GENERATE(kAtomicCAS_int, kAtomicCAS_uint, kAtomicCAS_ulong, + kAtomicCAS_ulonglong, kAtomicCAS_float, kAtomicCAS_double); + HIPRTC_CHECK( + hiprtcCreateProgram(&program, program_source, "atomicCAS_negative.cc", 0, nullptr, nullptr)); + hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)}; + + // Get the compile log and count compiler error messages + size_t log_size{}; + HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size)); + std::string log(log_size, ' '); + HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data())); + int error_count{0}; + + int expected_error_count{8}; + std::string error_message{"error:"}; + + size_t n_pos = log.find(error_message, 0); + while (n_pos != std::string::npos) { + ++error_count; + n_pos = log.find(error_message, n_pos + 1); + } + + HIPRTC_CHECK(hiprtcDestroyProgram(&program)); + HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION); + REQUIRE(error_count == expected_error_count); +} diff --git a/projects/hip-tests/catch/unit/atomics/atomicCAS_negative_kernels.cc b/projects/hip-tests/catch/unit/atomics/atomicCAS_negative_kernels.cc new file mode 100644 index 0000000000..b0390bb3fa --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicCAS_negative_kernels.cc @@ -0,0 +1,62 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +class Dummy { + public: + __device__ Dummy() {} + __device__ ~Dummy() {} +}; + +#define ATOMIC_CAS_NEGATIVE_KERNEL(type_name) \ + __global__ void atomicCAS_v1(type_name* address, type_name* result) { \ + *result = atomicCAS(&address, 12, 13); \ + } \ + __global__ void atomicCAS_v2(type_name* address, type_name* result) { \ + *result = atomicCAS(address, address, 13); \ + } \ + __global__ void atomicCAS_v3(type_name* address, type_name* result) { \ + *result = atomicCAS(address, 12, address); \ + } \ + __global__ void atomicCAS_v4(Dummy* address, type_name* result) { \ + *result = atomicCAS(address, 12, 13); \ + } \ + __global__ void atomicCAS_v5(char* address, type_name* result) { \ + *result = atomicCAS(address, 12, 13); \ + } \ + __global__ void atomicCAS_v6(short* address, type_name* result) { \ + *result = atomicCAS(address, 12, 13); \ + } \ + __global__ void atomicCAS_v7(long* address, type_name* result) { \ + *result = atomicCAS(address, 12, 13); \ + } \ + __global__ void atomicCAS_v8(long long* address, type_name* result) { \ + *result = atomicCAS(address, 12, 13); \ + } + +ATOMIC_CAS_NEGATIVE_KERNEL(int) +ATOMIC_CAS_NEGATIVE_KERNEL(unsigned int) +ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long) +ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long long) +ATOMIC_CAS_NEGATIVE_KERNEL(float) +ATOMIC_CAS_NEGATIVE_KERNEL(double) diff --git a/projects/hip-tests/catch/unit/atomics/atomicCAS_negative_kernels_rtc.hh b/projects/hip-tests/catch/unit/atomics/atomicCAS_negative_kernels_rtc.hh new file mode 100644 index 0000000000..952c4892fb --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicCAS_negative_kernels_rtc.hh @@ -0,0 +1,273 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +/* +Negative kernels used for the atomics negative Test Cases that are using RTC. +*/ + +static constexpr auto kAtomicCAS_int{ + R"( + class Dummy { + public: + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicCAS_int_v1(int* address, int* result) { + *result = atomicCAS(&address, 12, 13); + } + + __global__ void atomicCAS_int_v2(int* address, int* result) { + *result = atomicCAS(address, address, 13); + } + + __global__ void atomicCAS_int_v3(int* address, int* result) { + *result = atomicCAS(address, 12, address); + } + + __global__ void atomicCAS_int_v4(Dummy* address, int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_int_v5(char* address, int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_int_v6(short* address, int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_int_v7(long* address, int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_int_v8(long long* address, int* result) { + *result = atomicCAS(address, 12, 13); + } + )"}; + +static constexpr auto kAtomicCAS_uint{ + R"( + class Dummy { + public: + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicCAS_uint_v1(unsigned int* address, unsigned int* result) { + *result = atomicCAS(&address, 12, 13); + } + + __global__ void atomicCAS_uint_v2(unsigned int* address, unsigned int* result) { + *result = atomicCAS(address, address, 13); + } + + __global__ void atomicCAS_uint_v3(unsigned int* address, unsigned int* result) { + *result = atomicCAS(address, 12, address); + } + + __global__ void atomicCAS_uint_v4(Dummy* address, unsigned int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_uint_v5(char* address, unsigned int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_uint_v6(short* address, unsigned int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_uint_v7(long* address, unsigned int* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_uint_v8(long long* address, unsigned int* result) { + *result = atomicCAS(address, 12, 13); + } + )"}; + +static constexpr auto kAtomicCAS_ulong{ + R"( + class Dummy { + public: + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicCAS_ulong_v1(unsigned long* address, unsigned long* result) { + *result = atomicCAS(&address, 12, 13); + } + + __global__ void atomicCAS_ulong_v2(unsigned long* address, unsigned long* result) { + *result = atomicCAS(address, address, 13); + } + + __global__ void atomicCAS_ulong_v3(unsigned long* address, unsigned long* result) { + *result = atomicCAS(address, 12, address); + } + + __global__ void atomicCAS_ulong_v4(Dummy* address, unsigned long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulong_v5(char* address, unsigned long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulong_v6(short* address, unsigned long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulong_v7(long* address, unsigned long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulong_v8(long long* address, unsigned long* result) { + *result = atomicCAS(address, 12, 13); + } + )"}; + +static constexpr auto kAtomicCAS_ulonglong{ + R"( + class Dummy { + public: + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicCAS_ulonglong_v1(unsigned long long* address, unsigned long long* result) { + *result = atomicCAS(&address, 12, 13); + } + + __global__ void atomicCAS_ulonglong_v2(unsigned long long* address, unsigned long long* result) { + *result = atomicCAS(address, address, 13); + } + + __global__ void atomicCAS_ulonglong_v3(unsigned long long* address, unsigned long long* result) { + *result = atomicCAS(address, 12, address); + } + + __global__ void atomicCAS_ulonglong_v4(Dummy* address, unsigned long long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulonglong_v5(char* address, unsigned long long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulonglong_v6(short* address, unsigned long long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulonglong_v7(long* address, unsigned long long* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_ulonglong_v8(long long* address, unsigned long long* result) { + *result = atomicCAS(address, 12, 13); + } + )"}; + +static constexpr auto kAtomicCAS_float{ + R"( + class Dummy { + public: + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicCAS_float_v1(float* address, float* result) { + *result = atomicCAS(&address, 12, 13); + } + + __global__ void atomicCAS_float_v2(float* address, float* result) { + *result = atomicCAS(address, address, 13); + } + + __global__ void atomicCAS_float_v3(float* address, float* result) { + *result = atomicCAS(address, 12, address); + } + + __global__ void atomicCAS_float_v4(Dummy* address, float* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_float_v5(char* address, float* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_float_v6(short* address, float* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_float_v7(long* address, float* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_float_v8(long long* address, float* result) { + *result = atomicCAS(address, 12, 13); + } + )"}; + +static constexpr auto kAtomicCAS_double{ + R"( + class Dummy { + public: + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicCAS_double_v1(double* address, double* result) { + *result = atomicCAS(&address, 12, 13); + } + + __global__ void atomicCAS_double_v2(double* address, double* result) { + *result = atomicCAS(address, address, 13); + } + + __global__ void atomicCAS_double_v3(double* address, double* result) { + *result = atomicCAS(address, 12, address); + } + + __global__ void atomicCAS_double_v4(Dummy* address, double* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_double_v5(char* address, double* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_double_v6(short* address, double* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_double_v7(long* address, double* result) { + *result = atomicCAS(address, 12, 13); + } + + __global__ void atomicCAS_double_v8(long long* address, double* result) { + *result = atomicCAS(address, 12, 13); + } + )"}; diff --git a/projects/hip-tests/catch/unit/atomics/atomicCAS_system.cc b/projects/hip-tests/catch/unit/atomics/atomicCAS_system.cc new file mode 100644 index 0000000000..8f2dd8306b --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicCAS_system.cc @@ -0,0 +1,185 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "arithmetic_common.hh" + +#include + +/** + * @addtogroup atomicCAS_system atomicCAS_system + * @{ + * @ingroup AtomicsTest + */ + +#ifdef HT_NVIDIA +#define TYPES +#else +#define TYPES , float, double +#endif + +/** + * Test Description + * ------------------------ + * - Executes a kernel two times concurrently on a two devices wherein all threads will perform + * an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each + * thread will add the same value to the memory location, storing the return value into a separate + * output array slot corresponding to it. Once complete, the output array and target memory is + * validated to contain all the expected values. Several memory access patterns are tested: + * -# All threads exchange to a single, compile time deducible, memory location + * -# Each thread targets an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicCAS_system + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory + * - Several grid and block dimension combinations. + * Test source + * ------------------------ + * - unit/atomics/atomicCAS_system.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Peer_GPUs", "", int, unsigned int, + unsigned long long TYPES) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 2, 2, 1, sizeof(TestType)); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 2, 2, warp_size, sizeof(TestType)); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 2, 2, warp_size, cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - Executes a kernel on a single device wherein all threads will perform + * an atomic addition, implemented using an atomic CAS operation, on a target memory location. + * Each thread will add the same value to the memory location, storing the return value into a + * separate output array slot corresponding to it. While the kernel is running, the host + * performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the + * output array and target memory is validated to contain all the expected values. Several + * memory access patterns are tested: + * -# All threads exchange to a single, compile time deducible, memory location + * -# Each thread targets an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicCAS_system + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory + * - Several grid and block dimension combinations. + * Test source + * ------------------------ + * - unit/atomics/atomicCAS_system.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_GPU", "", int, unsigned int, + unsigned long long TYPES) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 1, 1, 1, sizeof(TestType), 4); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 1, 1, warp_size, sizeof(TestType), 4); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 1, 1, warp_size, cache_line_size, 4); + } + } +} + +/** + * Test Description + * ------------------------ + * - Executes a kernel two times on two devices wherein all threads will perform + * an atomic addition, implemented using an atomic CAS operation, on a target memory location. + * Each thread will add the same value to the memory location, storing the return value into a + * separate output array slot corresponding to it. While the kernel is running, the host + * performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the + * output array and target memory is validated to contain all the expected values. Several + * memory access patterns are tested: + * -# All threads exchange to a single, compile time deducible, memory location + * -# Each thread targets an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicCAS_system + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory + * - Several grid and block dimension combinations. + * Test source + * ------------------------ + * - unit/atomics/atomicCAS_system.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int, + unsigned long long TYPES) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 2, 2, 1, sizeof(TestType), 4); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 2, 2, warp_size, sizeof(TestType), 4); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + MultipleDeviceMultipleKernelAndHostTest( + 2, 2, warp_size, cache_line_size, 4); + } + } +}