EXSWHTEC-275 - Implement tests for atomic CAS operations. (#197)
[ROCm/hip-tests commit: a8640d8c62]
This commit is contained in:
@@ -0,0 +1,42 @@
|
||||
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
set(TEST_SRC
|
||||
atomicCAS.cc
|
||||
atomicCAS_system.cc
|
||||
)
|
||||
|
||||
if(HIP_PLATFORM MATCHES "nvidia")
|
||||
set_source_files_properties(atomicCAS_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
hip_add_exe_to_target(NAME AtomicsTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests
|
||||
LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
elseif(HIP_PLATFORM MATCHES "amd")
|
||||
hip_add_exe_to_target(NAME AtomicsTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests
|
||||
LINKER_LIBS hiprtc)
|
||||
endif()
|
||||
|
||||
add_test(NAME Unit_atomicCAS_Negative_Parameters
|
||||
COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
|
||||
${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
|
||||
atomicCAS_negative_kernels.cc 48)
|
||||
@@ -0,0 +1,449 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <resource_guards.hh>
|
||||
#include <cmd_options.hh>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
enum class AtomicOperation {
|
||||
kAdd = 0,
|
||||
kAddSystem,
|
||||
kSub,
|
||||
kSubSystem,
|
||||
kInc,
|
||||
kDec,
|
||||
kUnsafeAdd,
|
||||
kSafeAdd,
|
||||
kCASAdd,
|
||||
kCASAddSystem
|
||||
};
|
||||
|
||||
constexpr auto kIntegerTestValue = 7;
|
||||
constexpr auto kFloatingPointTestValue = 3.125;
|
||||
constexpr auto kIncDecWraparoundValue = 1023;
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
__host__ __device__ TestType GetTestValue() {
|
||||
if constexpr (operation == AtomicOperation::kInc || operation == AtomicOperation::kDec) {
|
||||
return kIncDecWraparoundValue;
|
||||
}
|
||||
|
||||
return std::is_floating_point_v<TestType> ? kFloatingPointTestValue : kIntegerTestValue;
|
||||
}
|
||||
|
||||
template <typename TestType> __device__ TestType CASAtomicAdd(TestType* address, TestType val) {
|
||||
TestType old = *address, assumed;
|
||||
|
||||
do {
|
||||
assumed = old;
|
||||
old = atomicCAS(address, assumed, val + assumed);
|
||||
} while (assumed != old);
|
||||
|
||||
return old;
|
||||
}
|
||||
|
||||
template <typename TestType>
|
||||
__device__ TestType CASAtomicAddSystem(TestType* address, TestType val) {
|
||||
TestType old = *address, assumed;
|
||||
|
||||
do {
|
||||
assumed = old;
|
||||
old = atomicCAS_system(address, assumed, val + assumed);
|
||||
} while (assumed != old);
|
||||
|
||||
return old;
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
__device__ TestType PerformAtomicOperation(TestType* const mem) {
|
||||
const auto val = GetTestValue<TestType, operation>();
|
||||
|
||||
if constexpr (operation == AtomicOperation::kAdd) {
|
||||
return atomicAdd(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kAddSystem) {
|
||||
return atomicAdd_system(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kSub) {
|
||||
return atomicSub(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kSubSystem) {
|
||||
return atomicSub_system(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kInc) {
|
||||
return atomicInc(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kDec) {
|
||||
return atomicDec(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kUnsafeAdd) {
|
||||
return unsafeAtomicAdd(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kSafeAdd) {
|
||||
return safeAtomicAdd(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kCASAdd) {
|
||||
return CASAtomicAdd(mem, val);
|
||||
} else if constexpr (operation == AtomicOperation::kCASAddSystem) {
|
||||
return CASAtomicAddSystem(mem, val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation, bool use_shared_mem>
|
||||
__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) {
|
||||
__shared__ TestType shared_mem;
|
||||
|
||||
const auto tid = cg::this_grid().thread_rank();
|
||||
|
||||
TestType* const mem = use_shared_mem ? &shared_mem : global_mem;
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
if (tid == 0) mem[0] = global_mem[0];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
old_vals[tid] = PerformAtomicOperation<TestType, operation>(mem);
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
__syncthreads();
|
||||
if (tid == 0) global_mem[0] = mem[0];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType>
|
||||
__host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned int pitch,
|
||||
const unsigned int idx) {
|
||||
const auto byte_ptr = reinterpret_cast<uint8_t*>(ptr);
|
||||
return reinterpret_cast<TestType*>(byte_ptr + idx * pitch);
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation, bool use_shared_mem>
|
||||
__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
|
||||
const unsigned int width, const unsigned pitch) {
|
||||
extern __shared__ uint8_t shared_mem[];
|
||||
|
||||
const auto tid = cg::this_grid().thread_rank();
|
||||
|
||||
TestType* const mem = use_shared_mem ? reinterpret_cast<TestType*>(shared_mem) : global_mem;
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
if (tid < width) {
|
||||
const auto target = PitchedOffset(mem, pitch, tid);
|
||||
*target = *PitchedOffset(global_mem, pitch, tid);
|
||||
};
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
old_vals[tid] =
|
||||
PerformAtomicOperation<TestType, operation>(PitchedOffset(mem, pitch, tid % width));
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
__syncthreads();
|
||||
if (tid < width) {
|
||||
const auto target = PitchedOffset(global_mem, pitch, tid);
|
||||
*target = *PitchedOffset(mem, pitch, tid);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
struct TestParams {
|
||||
auto ThreadCount() const {
|
||||
return blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z;
|
||||
}
|
||||
|
||||
auto HostIterationsPerThread() const {
|
||||
return std::max(num_devices * kernel_count * ThreadCount() / 20, width);
|
||||
}
|
||||
|
||||
dim3 blocks;
|
||||
dim3 threads;
|
||||
unsigned int num_devices = 1u;
|
||||
unsigned int kernel_count = 1u;
|
||||
unsigned int width = 1u;
|
||||
unsigned int pitch = 0u;
|
||||
unsigned int host_thread_count = 0u;
|
||||
LinearAllocs alloc_type;
|
||||
};
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
std::tuple<std::vector<TestType>, std::vector<TestType>> TestKernelHostRef(const TestParams& p) {
|
||||
const auto val = GetTestValue<TestType, operation>();
|
||||
|
||||
const auto total_thread_count = p.num_devices * p.kernel_count * p.ThreadCount() +
|
||||
p.host_thread_count * p.HostIterationsPerThread();
|
||||
|
||||
std::vector<TestType> res_vals(p.width);
|
||||
std::vector<TestType> old_vals;
|
||||
old_vals.reserve(total_thread_count);
|
||||
|
||||
auto perform_op = [&](unsigned id) {
|
||||
auto& res = res_vals[id % p.width];
|
||||
old_vals.push_back(res);
|
||||
|
||||
if constexpr (operation == AtomicOperation::kAdd || operation == AtomicOperation::kAddSystem ||
|
||||
operation == AtomicOperation::kUnsafeAdd ||
|
||||
operation == AtomicOperation::kSafeAdd || operation == AtomicOperation::kCASAdd ||
|
||||
operation == AtomicOperation::kCASAddSystem) {
|
||||
res = res + val;
|
||||
} else if constexpr (operation == AtomicOperation::kSub ||
|
||||
operation == AtomicOperation::kSubSystem) {
|
||||
res = res - val;
|
||||
} else if constexpr (operation == AtomicOperation::kInc) {
|
||||
res = (res >= val) ? 0 : res + 1;
|
||||
} else if constexpr (operation == AtomicOperation::kDec) {
|
||||
res = ((res == 0) || (res > val)) ? val : res - 1;
|
||||
}
|
||||
};
|
||||
|
||||
for (auto i = 0u; i < p.num_devices; ++i) {
|
||||
for (auto j = 0u; j < p.kernel_count; ++j) {
|
||||
for (auto tid = 0u; tid < p.ThreadCount(); ++tid) {
|
||||
perform_op(tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto i = 0u; i < p.host_thread_count; ++i) {
|
||||
for (auto j = 0u; j < p.HostIterationsPerThread(); ++j) {
|
||||
perform_op(j);
|
||||
}
|
||||
}
|
||||
|
||||
return {res_vals, old_vals};
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
void Verify(const TestParams& p, std::vector<TestType>& res_vals, std::vector<TestType>& old_vals) {
|
||||
auto [expected_res_vals, expected_old_vals] = TestKernelHostRef<TestType, operation>(p);
|
||||
|
||||
for (auto i = 0u; i < res_vals.size(); ++i) {
|
||||
INFO("Results index: " << i);
|
||||
REQUIRE(expected_res_vals[i] == res_vals[i]);
|
||||
}
|
||||
|
||||
std::sort(begin(old_vals), end(old_vals));
|
||||
std::sort(begin(expected_old_vals), end(expected_old_vals));
|
||||
for (auto i = 0u; i < old_vals.size(); ++i) {
|
||||
INFO("Old values index: " << i);
|
||||
REQUIRE(expected_old_vals[i] == old_vals[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation, bool use_shared_mem>
|
||||
void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr,
|
||||
TestType* const old_vals) {
|
||||
const auto shared_mem_size = use_shared_mem ? p.width * p.pitch : 0u;
|
||||
if (p.width == 1 && p.pitch == sizeof(TestType))
|
||||
TestKernel<TestType, operation, use_shared_mem>
|
||||
<<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals);
|
||||
else
|
||||
TestKernel<TestType, operation, use_shared_mem>
|
||||
<<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals, p.width, p.pitch);
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
void HostAtomicOperation(const unsigned int iterations, TestType* mem, TestType* const old_vals,
|
||||
const unsigned int width, const unsigned pitch, TestType base_val) {
|
||||
const auto val = GetTestValue<TestType, operation>();
|
||||
|
||||
for (auto i = 0u; i < iterations; ++i) {
|
||||
if constexpr (operation == AtomicOperation::kAddSystem ||
|
||||
operation == AtomicOperation::kCASAddSystem) {
|
||||
old_vals[i] = __atomic_fetch_add(PitchedOffset(mem, pitch, i % width), val, __ATOMIC_RELAXED);
|
||||
} else if constexpr (operation == AtomicOperation::kSubSystem) {
|
||||
old_vals[i] = __atomic_fetch_sub(PitchedOffset(mem, pitch, i % width), val, __ATOMIC_RELAXED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
void PerformHostAtomicOperation(const TestParams& p, TestType* mem, TestType* const old_vals) {
|
||||
if (p.host_thread_count == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto host_base_val = p.num_devices * p.kernel_count * p.ThreadCount();
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
for (auto i = 0u; i < p.host_thread_count; ++i) {
|
||||
const auto iterations = p.HostIterationsPerThread();
|
||||
const auto thread_base_val = host_base_val + i * iterations;
|
||||
threads.push_back(std::thread(HostAtomicOperation<TestType, operation>, iterations, mem,
|
||||
old_vals + thread_base_val, p.width, p.pitch, thread_base_val));
|
||||
}
|
||||
|
||||
for (auto& th : threads) {
|
||||
th.join();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation, bool use_shared_mem>
|
||||
void TestCore(const TestParams& p) {
|
||||
const unsigned int flags =
|
||||
p.alloc_type == LinearAllocs::mallocAndRegister ? hipHostRegisterMapped : 0u;
|
||||
|
||||
const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType);
|
||||
std::vector<LinearAllocGuard<TestType>> old_vals_devs;
|
||||
std::vector<StreamGuard> streams;
|
||||
for (auto i = 0; i < p.num_devices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
|
||||
for (auto j = 0; j < p.kernel_count; ++j) {
|
||||
streams.emplace_back(Streams::created);
|
||||
}
|
||||
}
|
||||
|
||||
const auto mem_alloc_size = p.width * p.pitch;
|
||||
LinearAllocGuard<TestType> mem_dev(p.alloc_type, mem_alloc_size, flags);
|
||||
|
||||
std::vector<TestType> old_vals(p.num_devices * p.kernel_count * p.ThreadCount() +
|
||||
p.host_thread_count * p.HostIterationsPerThread());
|
||||
std::vector<TestType> res_vals(p.width);
|
||||
|
||||
TestType* const mem_ptr =
|
||||
p.alloc_type == LinearAllocs::hipMalloc ? mem_dev.ptr() : mem_dev.host_ptr();
|
||||
|
||||
HIP_CHECK(hipMemset(mem_ptr, 0, mem_alloc_size));
|
||||
|
||||
for (auto i = 0u; i < p.num_devices; ++i) {
|
||||
for (auto j = 0u; j < p.kernel_count; ++j) {
|
||||
const auto& stream = streams[i * p.kernel_count + j].stream();
|
||||
const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
|
||||
LaunchKernel<TestType, operation, use_shared_mem>(p, stream, mem_dev.ptr(), old_vals);
|
||||
}
|
||||
}
|
||||
|
||||
PerformHostAtomicOperation<TestType, operation>(p, mem_dev.host_ptr(), old_vals.data());
|
||||
|
||||
for (auto i = 0u; i < p.num_devices; ++i) {
|
||||
const auto device_offset = i * p.kernel_count * p.ThreadCount();
|
||||
HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
|
||||
old_vals_alloc_size, hipMemcpyDeviceToHost));
|
||||
}
|
||||
HIP_CHECK(hipMemcpy2D(res_vals.data(), sizeof(TestType), mem_ptr, p.pitch, sizeof(TestType),
|
||||
p.width, hipMemcpyDeviceToHost));
|
||||
|
||||
Verify<TestType, operation>(p, res_vals, old_vals);
|
||||
}
|
||||
|
||||
inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); }
|
||||
|
||||
inline dim3 GenerateBlockDimensions() {
|
||||
int sm_count = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0));
|
||||
return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2));
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
|
||||
TestParams params;
|
||||
params.num_devices = 1;
|
||||
params.kernel_count = 1;
|
||||
params.threads = GenerateThreadDimensions();
|
||||
params.width = width;
|
||||
params.pitch = pitch;
|
||||
|
||||
SECTION("Global memory") {
|
||||
params.blocks = GenerateBlockDimensions();
|
||||
using LA = LinearAllocs;
|
||||
for (const auto alloc_type :
|
||||
{LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
|
||||
params.alloc_type = alloc_type;
|
||||
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
|
||||
TestCore<TestType, operation, false>(params);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SECTION("Shared memory") {
|
||||
params.blocks = dim3(1);
|
||||
params.alloc_type = LinearAllocs::hipMalloc;
|
||||
TestCore<TestType, operation, true>(params);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsigned int width,
|
||||
const unsigned int pitch) {
|
||||
int concurrent_kernels = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0));
|
||||
if (!concurrent_kernels) {
|
||||
HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
|
||||
return;
|
||||
}
|
||||
|
||||
TestParams params;
|
||||
params.num_devices = 1;
|
||||
params.kernel_count = kernel_count;
|
||||
params.blocks = GenerateBlockDimensions();
|
||||
params.threads = GenerateThreadDimensions();
|
||||
params.width = width;
|
||||
params.pitch = pitch;
|
||||
|
||||
using LA = LinearAllocs;
|
||||
for (const auto alloc_type :
|
||||
{LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
|
||||
params.alloc_type = alloc_type;
|
||||
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
|
||||
TestCore<TestType, operation, false>(params);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicOperation operation>
|
||||
void MultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices,
|
||||
const unsigned int kernel_count,
|
||||
const unsigned int width, const unsigned int pitch,
|
||||
const unsigned int host_thread_count = 0u) {
|
||||
if (num_devices > 1) {
|
||||
if (HipTest::getDeviceCount() < num_devices) {
|
||||
std::string msg = std::to_string(num_devices) + " devices are required";
|
||||
HipTest::HIP_SKIP_TEST(msg.c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (kernel_count > 1) {
|
||||
for (auto i = 0u; i < num_devices; ++i) {
|
||||
int concurrent_kernels = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i));
|
||||
if (!concurrent_kernels) {
|
||||
HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TestParams params;
|
||||
params.num_devices = num_devices;
|
||||
params.kernel_count = kernel_count;
|
||||
params.blocks = GenerateBlockDimensions();
|
||||
params.threads = GenerateThreadDimensions();
|
||||
params.width = width;
|
||||
params.pitch = pitch;
|
||||
params.host_thread_count = host_thread_count;
|
||||
|
||||
using LA = LinearAllocs;
|
||||
for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
|
||||
params.alloc_type = alloc_type;
|
||||
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
|
||||
TestCore<TestType, operation, false>(params);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,172 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "arithmetic_common.hh"
|
||||
#include "atomicCAS_negative_kernels_rtc.hh"
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup atomicCAS atomicCAS
|
||||
* @{
|
||||
* @ingroup AtomicsTest
|
||||
*/
|
||||
|
||||
#ifdef HT_NVIDIA
|
||||
#define TYPES
|
||||
#else
|
||||
#define TYPES , float, double
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a single kernel on a single device wherein all threads will perform an atomic
|
||||
* addition, implemented using an atomic CAS operation, on a target memory location. Each thread
|
||||
* will add the same value to the memory location, storing the return value into a separate output
|
||||
* array slot corresponding to it. Once complete, the output array and target memory is validated to
|
||||
* contain all the expected values. Several memory access patterns are tested:
|
||||
* -# All threads exchange to a single, compile time deducible, memory location
|
||||
* -# Each thread targets an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicCAS
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
|
||||
* - Shared memory
|
||||
* - Several grid and block dimension combinations (only one block is used for shared memory).
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicCAS.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive", "", int, unsigned int, unsigned long long TYPES) {
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
SingleDeviceSingleKernelTest<TestType, AtomicOperation::kCASAdd>(1, sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
SingleDeviceSingleKernelTest<TestType, AtomicOperation::kCASAdd>(warp_size, sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
SingleDeviceSingleKernelTest<TestType, AtomicOperation::kCASAdd>(warp_size, cache_line_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel two times concurrently on a single device wherein all threads will perform
|
||||
* an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each
|
||||
* thread will add the same value to the memory location, storing the return value into a separate
|
||||
* output array slot corresponding to it. Once complete, the output array and target memory is
|
||||
* validated to contain all the expected values. Several memory access patterns are tested:
|
||||
* -# All threads exchange to a single, compile time deducible, memory location
|
||||
* -# Each thread targets an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicCAS
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
|
||||
* - Several grid and block dimension combinations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicCAS.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive_Multi_Kernel", "", int, unsigned int,
|
||||
unsigned long long TYPES) {
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kCASAdd>(2, 1, sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kCASAdd>(2, warp_size,
|
||||
sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kCASAdd>(2, warp_size,
|
||||
cache_line_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
|
||||
* atomicCAS.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicCAS.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_atomicCAS_Negative_Parameters_RTC") {
|
||||
hiprtcProgram program{};
|
||||
|
||||
const auto program_source = GENERATE(kAtomicCAS_int, kAtomicCAS_uint, kAtomicCAS_ulong,
|
||||
kAtomicCAS_ulonglong, kAtomicCAS_float, kAtomicCAS_double);
|
||||
HIPRTC_CHECK(
|
||||
hiprtcCreateProgram(&program, program_source, "atomicCAS_negative.cc", 0, nullptr, nullptr));
|
||||
hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
|
||||
|
||||
// Get the compile log and count compiler error messages
|
||||
size_t log_size{};
|
||||
HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
|
||||
std::string log(log_size, ' ');
|
||||
HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
|
||||
int error_count{0};
|
||||
|
||||
int expected_error_count{8};
|
||||
std::string error_message{"error:"};
|
||||
|
||||
size_t n_pos = log.find(error_message, 0);
|
||||
while (n_pos != std::string::npos) {
|
||||
++error_count;
|
||||
n_pos = log.find(error_message, n_pos + 1);
|
||||
}
|
||||
|
||||
HIPRTC_CHECK(hiprtcDestroyProgram(&program));
|
||||
HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
|
||||
REQUIRE(error_count == expected_error_count);
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
#define ATOMIC_CAS_NEGATIVE_KERNEL(type_name) \
|
||||
__global__ void atomicCAS_v1(type_name* address, type_name* result) { \
|
||||
*result = atomicCAS(&address, 12, 13); \
|
||||
} \
|
||||
__global__ void atomicCAS_v2(type_name* address, type_name* result) { \
|
||||
*result = atomicCAS(address, address, 13); \
|
||||
} \
|
||||
__global__ void atomicCAS_v3(type_name* address, type_name* result) { \
|
||||
*result = atomicCAS(address, 12, address); \
|
||||
} \
|
||||
__global__ void atomicCAS_v4(Dummy* address, type_name* result) { \
|
||||
*result = atomicCAS(address, 12, 13); \
|
||||
} \
|
||||
__global__ void atomicCAS_v5(char* address, type_name* result) { \
|
||||
*result = atomicCAS(address, 12, 13); \
|
||||
} \
|
||||
__global__ void atomicCAS_v6(short* address, type_name* result) { \
|
||||
*result = atomicCAS(address, 12, 13); \
|
||||
} \
|
||||
__global__ void atomicCAS_v7(long* address, type_name* result) { \
|
||||
*result = atomicCAS(address, 12, 13); \
|
||||
} \
|
||||
__global__ void atomicCAS_v8(long long* address, type_name* result) { \
|
||||
*result = atomicCAS(address, 12, 13); \
|
||||
}
|
||||
|
||||
ATOMIC_CAS_NEGATIVE_KERNEL(int)
|
||||
ATOMIC_CAS_NEGATIVE_KERNEL(unsigned int)
|
||||
ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long)
|
||||
ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long long)
|
||||
ATOMIC_CAS_NEGATIVE_KERNEL(float)
|
||||
ATOMIC_CAS_NEGATIVE_KERNEL(double)
|
||||
@@ -0,0 +1,273 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
/*
|
||||
Negative kernels used for the atomics negative Test Cases that are using RTC.
|
||||
*/
|
||||
|
||||
static constexpr auto kAtomicCAS_int{
|
||||
R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicCAS_int_v1(int* address, int* result) {
|
||||
*result = atomicCAS(&address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_int_v2(int* address, int* result) {
|
||||
*result = atomicCAS(address, address, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_int_v3(int* address, int* result) {
|
||||
*result = atomicCAS(address, 12, address);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_int_v4(Dummy* address, int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_int_v5(char* address, int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_int_v6(short* address, int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_int_v7(long* address, int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_int_v8(long long* address, int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtomicCAS_uint{
|
||||
R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicCAS_uint_v1(unsigned int* address, unsigned int* result) {
|
||||
*result = atomicCAS(&address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_uint_v2(unsigned int* address, unsigned int* result) {
|
||||
*result = atomicCAS(address, address, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_uint_v3(unsigned int* address, unsigned int* result) {
|
||||
*result = atomicCAS(address, 12, address);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_uint_v4(Dummy* address, unsigned int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_uint_v5(char* address, unsigned int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_uint_v6(short* address, unsigned int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_uint_v7(long* address, unsigned int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_uint_v8(long long* address, unsigned int* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtomicCAS_ulong{
|
||||
R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicCAS_ulong_v1(unsigned long* address, unsigned long* result) {
|
||||
*result = atomicCAS(&address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulong_v2(unsigned long* address, unsigned long* result) {
|
||||
*result = atomicCAS(address, address, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulong_v3(unsigned long* address, unsigned long* result) {
|
||||
*result = atomicCAS(address, 12, address);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulong_v4(Dummy* address, unsigned long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulong_v5(char* address, unsigned long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulong_v6(short* address, unsigned long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulong_v7(long* address, unsigned long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulong_v8(long long* address, unsigned long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtomicCAS_ulonglong{
|
||||
R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
|
||||
*result = atomicCAS(&address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
|
||||
*result = atomicCAS(address, address, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
|
||||
*result = atomicCAS(address, 12, address);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v4(Dummy* address, unsigned long long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v5(char* address, unsigned long long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v6(short* address, unsigned long long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v7(long* address, unsigned long long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_ulonglong_v8(long long* address, unsigned long long* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtomicCAS_float{
|
||||
R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicCAS_float_v1(float* address, float* result) {
|
||||
*result = atomicCAS(&address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_float_v2(float* address, float* result) {
|
||||
*result = atomicCAS(address, address, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_float_v3(float* address, float* result) {
|
||||
*result = atomicCAS(address, 12, address);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_float_v4(Dummy* address, float* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_float_v5(char* address, float* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_float_v6(short* address, float* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_float_v7(long* address, float* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_float_v8(long long* address, float* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtomicCAS_double{
|
||||
R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicCAS_double_v1(double* address, double* result) {
|
||||
*result = atomicCAS(&address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_double_v2(double* address, double* result) {
|
||||
*result = atomicCAS(address, address, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_double_v3(double* address, double* result) {
|
||||
*result = atomicCAS(address, 12, address);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_double_v4(Dummy* address, double* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_double_v5(char* address, double* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_double_v6(short* address, double* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_double_v7(long* address, double* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
|
||||
__global__ void atomicCAS_double_v8(long long* address, double* result) {
|
||||
*result = atomicCAS(address, 12, 13);
|
||||
}
|
||||
)"};
|
||||
@@ -0,0 +1,185 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "arithmetic_common.hh"
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup atomicCAS_system atomicCAS_system
|
||||
* @{
|
||||
* @ingroup AtomicsTest
|
||||
*/
|
||||
|
||||
#ifdef HT_NVIDIA
|
||||
#define TYPES
|
||||
#else
|
||||
#define TYPES , float, double
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel two times concurrently on a two devices wherein all threads will perform
|
||||
* an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each
|
||||
* thread will add the same value to the memory location, storing the return value into a separate
|
||||
* output array slot corresponding to it. Once complete, the output array and target memory is
|
||||
* validated to contain all the expected values. Several memory access patterns are tested:
|
||||
* -# All threads exchange to a single, compile time deducible, memory location
|
||||
* -# Each thread targets an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicCAS_system
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
|
||||
* - Several grid and block dimension combinations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicCAS_system.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Peer_GPUs", "", int, unsigned int,
|
||||
unsigned long long TYPES) {
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
2, 2, 1, sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
2, 2, warp_size, sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
2, 2, warp_size, cache_line_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel on a single device wherein all threads will perform
|
||||
* an atomic addition, implemented using an atomic CAS operation, on a target memory location.
|
||||
* Each thread will add the same value to the memory location, storing the return value into a
|
||||
* separate output array slot corresponding to it. While the kernel is running, the host
|
||||
* performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the
|
||||
* output array and target memory is validated to contain all the expected values. Several
|
||||
* memory access patterns are tested:
|
||||
* -# All threads exchange to a single, compile time deducible, memory location
|
||||
* -# Each thread targets an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicCAS_system
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
|
||||
* - Several grid and block dimension combinations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicCAS_system.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_GPU", "", int, unsigned int,
|
||||
unsigned long long TYPES) {
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
1, 1, 1, sizeof(TestType), 4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
1, 1, warp_size, sizeof(TestType), 4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
1, 1, warp_size, cache_line_size, 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel two times on two devices wherein all threads will perform
|
||||
* an atomic addition, implemented using an atomic CAS operation, on a target memory location.
|
||||
* Each thread will add the same value to the memory location, storing the return value into a
|
||||
* separate output array slot corresponding to it. While the kernel is running, the host
|
||||
* performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the
|
||||
* output array and target memory is validated to contain all the expected values. Several
|
||||
* memory access patterns are tested:
|
||||
* -# All threads exchange to a single, compile time deducible, memory location
|
||||
* -# Each thread targets an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicCAS_system
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
|
||||
* - Several grid and block dimension combinations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicCAS_system.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
|
||||
unsigned long long TYPES) {
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
2, 2, 1, sizeof(TestType), 4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
2, 2, warp_size, sizeof(TestType), 4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
|
||||
2, 2, warp_size, cache_line_size, 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user