2
0

EXSWHTEC-287 - Implement tests for trigonometric device math functions #231

Change-Id: I44188fa2f60f2492e05596464c914b8c739be482


[ROCm/hip-tests commit: 36620358e6]
Este cometimento está contido em:
Dino Music
2024-01-22 18:59:43 +05:30
cometido por Rakesh Roy
ascendente db33e97368
cometimento e06f1302bf
13 ficheiros modificados com 2159 adições e 0 eliminações
+11
Ver ficheiro
@@ -19,8 +19,19 @@
# THE SOFTWARE.
set(TEST_SRC
trig_funcs.cc
)
hip_add_exe_to_target(NAME MathsTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
add_test(NAME Unit_Device_Single_Precision_Trig_Functions_Negative
COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
trig_single_precision_negative_kernels.cc 66)
add_test(NAME Unit_Device_Double_Precision_Trig_Functions_Negative
COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
trig_double_precision_negative_kernels.cc 66)
+136
Ver ficheiro
@@ -0,0 +1,136 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "math_common.hh"
#include "math_special_values.hh"
#include <hip/hip_cooperative_groups.h>
namespace cg = cooperative_groups;
#define MATH_BINARY_KERNEL_DEF(func_name) \
template <typename T, typename RT = T> \
__global__ void func_name##_kernel(RT* const ys, const size_t num_xs, T* const x1s, \
T* const x2s) { \
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(x1s[i], x2s[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
ys[i] = func_name(x1s[i], x2s[i]); \
} \
} \
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void BinaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg> kernel,
ref_sig<RT, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder,
const TArg a = std::numeric_limits<TArg>::lowest(),
const TArg b = std::numeric_limits<TArg>::max()) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const uint64_t num_iterations = GetTestIterationCount();
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 2 + sizeof(T)), num_iterations);
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
MathTest math_test(kernel, max_batch_size);
auto batch_size = max_batch_size;
const auto num_threads = thread_pool.thread_count();
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
const auto min_sub_batch_size = batch_size / num_threads;
const auto tail = batch_size % num_threads;
auto base_idx = 0u;
for (auto i = 0u; i < num_threads; ++i) {
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &x1s, &x2s] {
const auto generator = [=] {
static thread_local std::mt19937 rng(std::random_device{}());
std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
return static_cast<TArg>(unif_dist(rng));
};
std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
});
base_idx += sub_batch_size;
}
thread_pool.Wait();
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
x2s.ptr());
}
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void BinaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg> kernel,
ref_sig<RT, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto values = std::get<SpecialVals<TArg>>(kSpecialValRegistry);
const auto size = values.size * values.size;
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
for (auto i = 0u; i < values.size; ++i) {
for (auto j = 0u; j < values.size; ++j) {
x1s.ptr()[i * values.size + j] = values.data[i];
x2s.ptr()[i * values.size + j] = values.data[j];
}
}
MathTest math_test(kernel, size);
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
x2s.ptr());
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void BinaryFloatingPointTest(kernel_sig<T, TArg, TArg> kernel, ref_sig<RT, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
SECTION("Special values") {
BinaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
}
SECTION("Brute force") { BinaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder); }
}
#define MATH_BINARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
MATH_BINARY_KERNEL_DEF(kern_name) \
\
TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) { \
using RT = RefType_t<TestType>; \
RT (*ref)(RT, RT) = ref_func; \
const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp; \
\
BinaryFloatingPointTest(kern_name##_kernel<TestType>, ref, \
ULPValidatorBuilderFactory<TestType>(ulp)); \
}
+240
Ver ficheiro
@@ -0,0 +1,240 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <cmd_options.hh>
#include <hip_test_common.hh>
#include <resource_guards.hh>
#include <hip/hip_cooperative_groups.h>
#include "thread_pool.hh"
#include "validators.hh"
namespace cg = cooperative_groups;
template <typename T, typename U>
std::enable_if_t<std::conjunction_v<std::is_arithmetic<T>, std::is_arithmetic<U>>, std::ostream&>
operator<<(std::ostream& os, const std::pair<T, U>& p) {
const auto default_prec = os.precision();
return os << "<" << std::setprecision(std::numeric_limits<T>::max_digits10 - 1) << p.first << ", "
<< std::setprecision(std::numeric_limits<U>::max_digits10 - 1) << p.second << ">"
<< std::setprecision(default_prec);
}
// This class represents a generic numerical accuracy math test. Template parameter T is the output
// type of the function being tested, and template parameter pack Ts represents the input types. The
// constructor takes a kernel with the signature void(T*, const size_t, Ts*...). The first kernel
// parameter is the output array, the second parameter is the number of outputs, and the rest of the
// parameters are arrays containing input values. The number of input arrays depends on the arity of
// the function being tested e.g. one input array for unary functions, two input arrays for binary
// functions, etc. The kernel threads take one element from each input array at the index
// corresponding to that thread, feed the input elements to the testee function, and store the
// result in the output array at the corresponding index.
//
// E.g. for a binary function the kernel would have the following signature:
// void kernel(float* y, const size_t n, float* x1, float* x2)
//
// The outputs would be calculated in parallel the following way:
// y[0] = testee(x1[0], x2[0])
// y[1] = testee(x1[1], x2[1])
// y[2] = testee(x1[2], x2[2])
// ...
//
// The constructor also takes max_num_args, which represents the maximum number of input values used
// for one kernel launch. The device memory for the input and output arrays is allocated based on
// that number.
template <typename T, typename... Ts> class MathTest {
public:
MathTest(void (*kernel)(T*, const size_t, Ts*...), const size_t max_num_args)
: kernel_{kernel},
xss_dev_(LinearAllocGuard<Ts>(LinearAllocs::hipMalloc, max_num_args * sizeof(Ts))...),
y_dev_{LinearAllocs::hipMalloc, max_num_args * sizeof(T)},
y_{LinearAllocs::hipHostMalloc, max_num_args * sizeof(T)} {}
// This method runs the test with the following steps:
// 1. Copy the values from the input arrays provided in the parameter pack xss to device memory
// 2. Launch the kernel using the configuration provided in grid_dims and block_dims
// 3. Copy the outputs back to host memory
// 4. Generate the reference values using ref_func and compare against the outputs using the
// validator provided by validator_builder
// 5. If non-type template parameter parallel is true, then step 4 is broken up into chunks of
// work that are done in parallel on the host.
template <bool parallel = true, typename RT, typename ValidatorBuilder, typename... RTs>
void Run(const ValidatorBuilder& validator_builder, const size_t grid_dims,
const size_t block_dims, RT (*const ref_func)(RTs...), const size_t num_args,
const Ts*... xss) {
fail_flag_.store(false);
error_info_.clear();
RunImpl<parallel>(validator_builder, grid_dims, block_dims, ref_func, num_args,
std::index_sequence_for<Ts...>{}, xss...);
}
private:
void (*kernel_)(T*, const size_t, Ts*...);
std::tuple<LinearAllocGuard<Ts>...> xss_dev_;
LinearAllocGuard<T> y_dev_;
LinearAllocGuard<T> y_;
std::atomic<bool> fail_flag_{false};
std::mutex mtx_;
std::string error_info_;
template <bool parallel, typename RT, typename ValidatorBuilder, typename... RTs, size_t... I>
void RunImpl(const ValidatorBuilder& validator_builder, const size_t grid_dim,
const size_t block_dim, RT (*const ref_func)(RTs...), const size_t num_args,
std::index_sequence<I...> is, const Ts*... xss) {
const auto xss_tup = std::make_tuple(xss...);
constexpr auto f = [](auto dst, auto src, size_t size) {
HIP_CHECK(hipMemcpy(dst, src, size, hipMemcpyHostToDevice))
};
((f(std::get<I>(xss_dev_).ptr(), std::get<I>(xss_tup),
num_args * sizeof(*std::get<I>(xss_tup)))),
...);
kernel_<<<grid_dim, block_dim>>>(y_dev_.ptr(), num_args, std::get<I>(xss_dev_).ptr()...);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(y_.ptr(), y_dev_.ptr(), num_args * sizeof(T), hipMemcpyDeviceToHost));
HIP_CHECK(hipStreamSynchronize(nullptr));
if constexpr (!parallel) {
for (auto i = 0u; i < num_args; ++i) {
const auto actual_val = y_.ptr()[i];
const auto ref_val = static_cast<T>(ref_func(xss[i]...));
const auto validator = validator_builder(ref_val, xss[i]...);
if (!validator->match(actual_val)) {
const auto log = MakeLogMessage(actual_val, xss[i]...) + validator->describe() + "\n";
INFO(log);
REQUIRE(false);
}
}
return;
}
const auto task = [&, this](size_t iters, size_t base_idx) {
for (auto i = 0u; i < iters; ++i) {
if (fail_flag_.load(std::memory_order_relaxed)) return;
const auto actual_val = y_.ptr()[base_idx + i];
const auto ref_val = static_cast<T>(ref_func(xss[base_idx + i]...));
const auto validator = validator_builder(ref_val, xss[base_idx + i]...);
if (!validator->match(actual_val)) {
fail_flag_.store(true, std::memory_order_relaxed);
// Several threads might have passed the first check, but failed validation. On the
// chance of this happening, access to the string stream must be serialized.
const auto log =
MakeLogMessage(actual_val, xss[base_idx + i]...) + validator->describe() + "\n";
{
std::lock_guard lg{mtx_};
error_info_ += log;
}
return;
}
}
};
const auto task_count = thread_pool.thread_count();
const auto chunk_size = num_args / task_count;
const auto tail = num_args % task_count;
auto base_idx = 0u;
for (auto i = 0u; i < task_count; ++i) {
const auto iters = chunk_size + (i < tail);
thread_pool.Post([=, &task] { task(iters, base_idx); });
base_idx += iters;
}
thread_pool.Wait();
INFO(error_info_);
REQUIRE(!fail_flag_);
}
template <typename... Args> std::string MakeLogMessage(T actual_val, Args... args) {
std::stringstream ss;
ss << "Input value(s): " << std::scientific
<< std::setprecision(std::numeric_limits<T>::max_digits10 - 1);
((ss << " " << args), ...) << "\n" << actual_val << " ";
return ss.str();
}
};
template <typename T> struct RefType {};
template <> struct RefType<float> { using type = double; };
template <> struct RefType<double> { using type = long double; };
template <typename T> using RefType_t = typename RefType<T>::type;
template <typename F> auto GetOccupancyMaxPotentialBlockSize(F kernel) {
int grid_size = 0, block_size = 0;
HIP_CHECK(hipOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0, 0));
return std::make_tuple(grid_size, block_size);
}
inline size_t GetMaxAllowedDeviceMemoryUsage() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
return props.totalGlobalMem * (cmd_options.accuracy_max_memory * 0.01f);
}
inline uint64_t GetTestIterationCount() { return cmd_options.accuracy_iterations; }
template <typename T, typename... Ts> using kernel_sig = void (*)(T*, const size_t, Ts*...);
template <typename T, typename... Ts> using ref_sig = T (*)(Ts...);
template <int error_num> void NegativeTestRTCWrapper(const char* program_source) {
hiprtcProgram program{};
HIPRTC_CHECK(
hiprtcCreateProgram(&program, program_source, "math_test_rtc.cc", 0, nullptr, nullptr));
hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
// Get the compile log and count compiler error messages
size_t log_size{};
HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
std::string log(log_size, ' ');
HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
int error_count{0};
int expected_error_count{error_num};
std::string error_message{"error:"};
size_t n_pos = log.find(error_message, 0);
while (n_pos != std::string::npos) {
++error_count;
n_pos = log.find(error_message, n_pos + 1);
}
HIPRTC_CHECK(hiprtcDestroyProgram(&program));
HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
REQUIRE(error_count == expected_error_count);
}
+287
Ver ficheiro
@@ -0,0 +1,287 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Disclaimer:
// This code is based on the work found in OpenCL-CTS authored by The Khronos Group.
// The original code can be found at https://github.com/KhronosGroup/OpenCL-CTS.
// We acknowledge the contributions of The Khronos Group to the development of this code.
#pragma once
#include <array>
#include <limits>
/*-----------------------------------------------------------------------------
HEX_FLT, HEXT_DBL, HEX_LDBL -- Create hex floating point literal of type
float, double, long double respectively. Arguments:
sm -- sign of number,
int -- integer part of mantissa (without `0x' prefix),
fract -- fractional part of mantissa (without decimal point and `L' or
`LL' suffixes),
se -- sign of exponent,
exp -- absolute value of (binary) exponent.
Example:
double yhi = HEX_DBL(+, 1, 5555555555555, -, 2); // 0x1.5555555555555p-2
Note:
We have to pass signs as separate arguments because gcc pass negative
integer values (e. g. `-2') into a macro as two separate tokens, so
`HEX_FLT(1, 0, -2)' produces result `0x1.0p- 2' (note a space between minus
and two) which is not a correct floating point literal.
-----------------------------------------------------------------------------*/
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
// If compiler does not support hex floating point literals:
#define HEX_FLT(sm, int, fract, se, exp) \
sm ldexpf((float)(0x##int##fract##UL), \
se exp + ilogbf((float)0x##int) - ilogbf((float)(0x##int##fract##UL)))
#define HEX_DBL(sm, int, fract, se, exp) \
sm ldexp((double)(0x##int##fract##ULL), \
se exp + ilogb((double)0x##int) - ilogb((double)(0x##int##fract##ULL)))
#define HEX_LDBL(sm, int, fract, se, exp) \
sm ldexpl((long double)(0x##int##fract##ULL), \
se exp + ilogbl((long double)0x##int) - ilogbl((long double)(0x##int##fract##ULL)))
#else
// If compiler supports hex floating point literals: just concatenate all the
// parts into a literal.
#define HEX_FLT(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp##F
#define HEX_DBL(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp
#define HEX_LDBL(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp##L
#endif
inline constexpr std::array kSpecialValuesDouble{
-std::numeric_limits<double>::quiet_NaN(),
-std::numeric_limits<double>::infinity(),
-std::numeric_limits<double>::max(),
HEX_DBL(-, 1, 0000000000001, +, 64),
HEX_DBL(-, 1, 0, +, 64),
HEX_DBL(-, 1, fffffffffffff, +, 63),
HEX_DBL(-, 1, 0000000000001, +, 63),
HEX_DBL(-, 1, 0, +, 63),
HEX_DBL(-, 1, fffffffffffff, +, 62),
HEX_DBL(-, 1, 000002, +, 32),
HEX_DBL(-, 1, 0, +, 32),
HEX_DBL(-, 1, fffffffffffff, +, 31),
HEX_DBL(-, 1, 0000000000001, +, 31),
HEX_DBL(-, 1, 0, +, 31),
HEX_DBL(-, 1, fffffffffffff, +, 30),
-1000.0,
-100.0,
-4.0,
-3.5,
-3.0,
HEX_DBL(-, 1, 8000000000001, +, 1),
-2.5,
HEX_DBL(-, 1, 7ffffffffffff, +, 1),
-2.0,
HEX_DBL(-, 1, 8000000000001, +, 0),
-1.5,
HEX_DBL(-, 1, 7ffffffffffff, +, 0),
HEX_DBL(-, 1, 0000000000001, +, 0),
-1.0,
HEX_DBL(-, 1, fffffffffffff, -, 1),
HEX_DBL(-, 1, 0000000000001, -, 1),
-0.5,
HEX_DBL(-, 1, fffffffffffff, -, 2),
HEX_DBL(-, 1, 0000000000001, -, 2),
-0.25,
HEX_DBL(-, 1, fffffffffffff, -, 3),
HEX_DBL(-, 1, 0000000000001, -, 1022),
-std::numeric_limits<double>::min(),
HEX_DBL(-, 0, fffffffffffff, -, 1022),
HEX_DBL(-, 0, 0000000000fff, -, 1022),
HEX_DBL(-, 0, 00000000000fe, -, 1022),
HEX_DBL(-, 0, 000000000000e, -, 1022),
HEX_DBL(-, 0, 000000000000c, -, 1022),
HEX_DBL(-, 0, 000000000000a, -, 1022),
HEX_DBL(-, 0, 0000000000008, -, 1022),
HEX_DBL(-, 0, 0000000000007, -, 1022),
HEX_DBL(-, 0, 0000000000006, -, 1022),
HEX_DBL(-, 0, 0000000000005, -, 1022),
HEX_DBL(-, 0, 0000000000004, -, 1022),
HEX_DBL(-, 0, 0000000000003, -, 1022),
HEX_DBL(-, 0, 0000000000002, -, 1022),
HEX_DBL(-, 0, 0000000000001, -, 1022),
-0.0,
std::numeric_limits<double>::quiet_NaN(),
std::numeric_limits<double>::infinity(),
std::numeric_limits<double>::max(),
HEX_DBL(+, 1, 0000000000001, +, 64),
HEX_DBL(+, 1, 0, +, 64),
HEX_DBL(+, 1, fffffffffffff, +, 63),
HEX_DBL(+, 1, 0000000000001, +, 63),
HEX_DBL(+, 1, 0, +, 63),
HEX_DBL(+, 1, fffffffffffff, +, 62),
HEX_DBL(+, 1, 000002, +, 32),
HEX_DBL(+, 1, 0, +, 32),
HEX_DBL(+, 1, fffffffffffff, +, 31),
HEX_DBL(+, 1, 0000000000001, +, 31),
HEX_DBL(+, 1, 0, +, 31),
HEX_DBL(+, 1, fffffffffffff, +, 30),
+1000.0,
+100.0,
+4.0,
+3.5,
+3.0,
HEX_DBL(+, 1, 8000000000001, +, 1),
+2.5,
HEX_DBL(+, 1, 7ffffffffffff, +, 1),
+2.0,
HEX_DBL(+, 1, 8000000000001, +, 0),
+1.5,
HEX_DBL(+, 1, 7ffffffffffff, +, 0),
HEX_DBL(+, 1, 0000000000001, +, 0),
+1.0,
HEX_DBL(+, 1, fffffffffffff, -, 1),
HEX_DBL(+, 1, 0000000000001, -, 1),
+0.5,
HEX_DBL(+, 1, fffffffffffff, -, 2),
HEX_DBL(+, 1, 0000000000001, -, 2),
+0.25,
HEX_DBL(+, 1, fffffffffffff, -, 3),
HEX_DBL(+, 1, 0000000000001, -, 1022),
+std::numeric_limits<double>::min(),
HEX_DBL(+, 0, fffffffffffff, -, 1022),
HEX_DBL(+, 0, 0000000000fff, -, 1022),
HEX_DBL(+, 0, 00000000000fe, -, 1022),
HEX_DBL(+, 0, 000000000000e, -, 1022),
HEX_DBL(+, 0, 000000000000c, -, 1022),
HEX_DBL(+, 0, 000000000000a, -, 1022),
HEX_DBL(+, 0, 0000000000008, -, 1022),
HEX_DBL(+, 0, 0000000000007, -, 1022),
HEX_DBL(+, 0, 0000000000006, -, 1022),
HEX_DBL(+, 0, 0000000000005, -, 1022),
HEX_DBL(+, 0, 0000000000004, -, 1022),
HEX_DBL(+, 0, 0000000000003, -, 1022),
HEX_DBL(+, 0, 0000000000002, -, 1022),
HEX_DBL(+, 0, 0000000000001, -, 1022),
+0.0,
};
inline constexpr std::array kSpecialValuesFloat{
-std::numeric_limits<float>::quiet_NaN(),
-std::numeric_limits<float>::infinity(),
-std::numeric_limits<float>::max(),
HEX_FLT(-, 1, 000002, +, 64),
HEX_FLT(-, 1, 0, +, 64),
HEX_FLT(-, 1, fffffe, +, 63),
HEX_FLT(-, 1, 000002, +, 63),
HEX_FLT(-, 1, 0, +, 63),
HEX_FLT(-, 1, fffffe, +, 62),
HEX_FLT(-, 1, 000002, +, 32),
HEX_FLT(-, 1, 0, +, 32),
HEX_FLT(-, 1, fffffe, +, 31),
HEX_FLT(-, 1, 000002, +, 31),
HEX_FLT(-, 1, 0, +, 31),
HEX_FLT(-, 1, fffffe, +, 30),
-1000.f,
-100.f,
-4.0f,
-3.5f,
-3.0f,
HEX_FLT(-, 1, 800002, +, 1),
-2.5f,
HEX_FLT(-, 1, 7ffffe, +, 1),
-2.0f,
HEX_FLT(-, 1, 800002, +, 0),
-1.5f,
HEX_FLT(-, 1, 7ffffe, +, 0),
HEX_FLT(-, 1, 000002, +, 0),
-1.0f,
HEX_FLT(-, 1, fffffe, -, 1),
HEX_FLT(-, 1, 000002, -, 1),
-0.5f,
HEX_FLT(-, 1, fffffe, -, 2),
HEX_FLT(-, 1, 000002, -, 2),
-0.25f,
HEX_FLT(-, 1, fffffe, -, 3),
HEX_FLT(-, 1, 000002, -, 126),
-std::numeric_limits<float>::min(),
HEX_FLT(-, 0, fffffe, -, 126),
HEX_FLT(-, 0, 000ffe, -, 126),
HEX_FLT(-, 0, 0000fe, -, 126),
HEX_FLT(-, 0, 00000e, -, 126),
HEX_FLT(-, 0, 00000c, -, 126),
HEX_FLT(-, 0, 00000a, -, 126),
HEX_FLT(-, 0, 000008, -, 126),
HEX_FLT(-, 0, 000006, -, 126),
HEX_FLT(-, 0, 000004, -, 126),
HEX_FLT(-, 0, 000002, -, 126),
-0.0f,
std::numeric_limits<float>::quiet_NaN(),
std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::max(),
HEX_FLT(+, 1, 000002, +, 64),
HEX_FLT(+, 1, 0, +, 64),
HEX_FLT(+, 1, fffffe, +, 63),
HEX_FLT(+, 1, 000002, +, 63),
HEX_FLT(+, 1, 0, +, 63),
HEX_FLT(+, 1, fffffe, +, 62),
HEX_FLT(+, 1, 000002, +, 32),
HEX_FLT(+, 1, 0, +, 32),
HEX_FLT(+, 1, fffffe, +, 31),
HEX_FLT(+, 1, 000002, +, 31),
HEX_FLT(+, 1, 0, +, 31),
HEX_FLT(+, 1, fffffe, +, 30),
+1000.f,
+100.f,
+4.0f,
+3.5f,
+3.0f,
HEX_FLT(+, 1, 800002, +, 1),
2.5f,
HEX_FLT(+, 1, 7ffffe, +, 1),
+2.0f,
HEX_FLT(+, 1, 800002, +, 0),
1.5f,
HEX_FLT(+, 1, 7ffffe, +, 0),
HEX_FLT(+, 1, 000002, +, 0),
+1.0f,
HEX_FLT(+, 1, fffffe, -, 1),
HEX_FLT(+, 1, 000002, -, 1),
+0.5f,
HEX_FLT(+, 1, fffffe, -, 2),
HEX_FLT(+, 1, 000002, -, 2),
+0.25f,
HEX_FLT(+, 1, fffffe, -, 3),
HEX_FLT(+, 1, 000002, -, 126),
+std::numeric_limits<float>::min(),
HEX_FLT(+, 0, fffffe, -, 126),
HEX_FLT(+, 0, 000ffe, -, 126),
HEX_FLT(+, 0, 0000fe, -, 126),
HEX_FLT(+, 0, 00000e, -, 126),
HEX_FLT(+, 0, 00000c, -, 126),
HEX_FLT(+, 0, 00000a, -, 126),
HEX_FLT(+, 0, 000008, -, 126),
HEX_FLT(+, 0, 000006, -, 126),
HEX_FLT(+, 0, 000004, -, 126),
HEX_FLT(+, 0, 000002, -, 126),
+0.0f,
};
template <typename T> struct SpecialVals {
const T* const data;
const size_t size;
};
inline constexpr auto kSpecialValRegistry =
std::make_tuple(SpecialVals<float>{kSpecialValuesFloat.data(), kSpecialValuesFloat.size()},
SpecialVals<double>{kSpecialValuesDouble.data(), kSpecialValuesDouble.size()});
+246
Ver ficheiro
@@ -0,0 +1,246 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "math_common.hh"
#include "math_special_values.hh"
#include <hip/hip_cooperative_groups.h>
namespace cg = cooperative_groups;
#define MATH_QUATERNARY_KERNEL_DEF(func_name) \
template <typename T> \
__global__ void func_name##_kernel(T* const ys, const size_t num_xs, T* const x1s, T* const x2s, \
T* const x3s, T* const x4s) { \
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(x1s[i], x2s[i], x3s[i], x4s[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
ys[i] = func_name(x1s[i], x2s[i], x3s[i], x4s[i]); \
} \
} \
}
inline constexpr std::array kSpecialValuesReducedDouble{
-std::numeric_limits<double>::quiet_NaN(),
-std::numeric_limits<double>::infinity(),
-std::numeric_limits<double>::max(),
HEX_DBL(-, 1, 0000000000001, +, 64),
HEX_DBL(-, 1, fffffffffffff, +, 63),
HEX_DBL(-, 1, fffffffffffff, +, 62),
HEX_DBL(-, 1, 0, +, 32),
HEX_DBL(-, 1, 0000000000001, +, 31),
HEX_DBL(-, 1, fffffffffffff, +, 30),
-1000.0,
-3.5,
HEX_DBL(-, 1, 8000000000001, +, 1),
-2.5,
HEX_DBL(-, 1, 8000000000001, +, 0),
-1.5,
-0.5,
-0.25,
HEX_DBL(-, 1, fffffffffffff, -, 3),
-std::numeric_limits<double>::min(),
HEX_DBL(-, 0, fffffffffffff, -, 1022),
HEX_DBL(-, 0, 0000000000001, -, 1022),
-0.0,
std::numeric_limits<double>::quiet_NaN(),
std::numeric_limits<double>::infinity(),
std::numeric_limits<double>::max(),
HEX_DBL(+, 1, 0, +, 64),
HEX_DBL(+, 1, 0000000000001, +, 63),
HEX_DBL(+, 1, 000002, +, 32),
HEX_DBL(+, 1, fffffffffffff, +, 31),
HEX_DBL(+, 1, 0, +, 31),
HEX_DBL(+, 1, fffffffffffff, +, 30),
+100.0,
+3.0,
HEX_DBL(+, 1, 7ffffffffffff, +, 1),
+2.0,
HEX_DBL(+, 1, 7ffffffffffff, +, 0),
+1.0,
HEX_DBL(+, 1, fffffffffffff, -, 2),
+std::numeric_limits<double>::min(),
HEX_DBL(+, 0, 0000000000fff, -, 1022),
HEX_DBL(+, 0, 0000000000007, -, 1022),
+0.0,
};
inline constexpr std::array kSpecialValuesReducedFloat{
-std::numeric_limits<float>::quiet_NaN(),
-std::numeric_limits<float>::infinity(),
-std::numeric_limits<float>::max(),
HEX_FLT(-, 1, 000002, +, 64),
HEX_FLT(-, 1, fffffe, +, 63),
HEX_FLT(-, 1, fffffe, +, 62),
HEX_FLT(-, 1, 0, +, 32),
HEX_FLT(-, 1, fffffe, +, 31),
HEX_FLT(-, 1, fffffe, +, 30),
-1000.f,
-3.5f,
HEX_FLT(-, 1, 800002, +, 1),
-2.5f,
HEX_FLT(-, 1, 800002, +, 0),
-1.5f,
-0.5f,
-0.25f,
HEX_FLT(-, 1, fffffe, -, 3),
-std::numeric_limits<float>::min(),
HEX_FLT(-, 0, fffffe, -, 126),
HEX_FLT(-, 0, 000002, -, 126),
-0.0f,
std::numeric_limits<float>::quiet_NaN(),
std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::max(),
HEX_FLT(+, 1, 0, +, 64),
HEX_FLT(+, 1, 000002, +, 63),
HEX_FLT(+, 1, 000002, +, 32),
HEX_FLT(+, 1, 000002, +, 31),
HEX_FLT(+, 1, fffffe, +, 30),
+100.f,
+4.0f,
HEX_FLT(+, 1, 7ffffe, +, 1),
+2.0f,
HEX_FLT(+, 1, 7ffffe, +, 0),
+1.0f,
HEX_FLT(+, 1, fffffe, -, 2),
+std::numeric_limits<float>::min(),
HEX_FLT(+, 0, 000ffe, -, 126),
HEX_FLT(+, 0, 000006, -, 126),
+0.0f,
};
inline constexpr auto kSpecialValReducedRegistry = std::make_tuple(
SpecialVals<float>{kSpecialValuesReducedFloat.data(), kSpecialValuesReducedFloat.size()},
SpecialVals<double>{kSpecialValuesReducedDouble.data(), kSpecialValuesReducedDouble.size()});
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void QuaternaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder,
const TArg a = std::numeric_limits<TArg>::lowest(),
const TArg b = std::numeric_limits<TArg>::max()) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const uint64_t num_iterations = GetTestIterationCount();
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 4 + sizeof(T)), num_iterations);
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
LinearAllocGuard<TArg> x4s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
MathTest math_test(kernel, max_batch_size);
auto batch_size = max_batch_size;
const auto num_threads = thread_pool.thread_count();
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
const auto min_sub_batch_size = batch_size / num_threads;
const auto tail = batch_size % num_threads;
auto base_idx = 0u;
for (auto i = 0u; i < num_threads; ++i) {
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &x1s, &x2s, &x3s, &x4s] {
const auto generator = [=] {
static thread_local std::mt19937 rng(std::random_device{}());
std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
return static_cast<TArg>(unif_dist(rng));
};
std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
std::generate(x3s.ptr() + base_idx, x3s.ptr() + base_idx + sub_batch_size, generator);
std::generate(x4s.ptr() + base_idx, x4s.ptr() + base_idx + sub_batch_size, generator);
});
base_idx += sub_batch_size;
}
thread_pool.Wait();
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
x2s.ptr(), x3s.ptr(), x4s.ptr());
}
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void QuaternaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto values = std::get<SpecialVals<TArg>>(kSpecialValReducedRegistry);
const auto size = values.size * values.size * values.size * values.size;
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
LinearAllocGuard<TArg> x4s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
for (auto i = 0u; i < values.size; ++i) {
for (auto j = 0u; j < values.size; ++j) {
for (auto k = 0u; k < values.size; ++k) {
for (auto l = 0u; l < values.size; ++l) {
x1s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[i];
x2s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[j];
x3s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[k];
x4s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[l];
}
}
}
}
MathTest math_test(kernel, size);
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
x2s.ptr(), x3s.ptr(), x4s.ptr());
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void QuaternaryFloatingPointTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
SECTION("Special values") {
QuaternaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
}
SECTION("Brute force") {
QuaternaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder);
}
}
#define MATH_QUATERNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
MATH_QUATERNARY_KERNEL_DEF(kern_name) \
\
TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) { \
using RT = RefType_t<TestType>; \
RT (*ref)(RT, RT, RT, RT) = ref_func; \
const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp; \
\
QuaternaryFloatingPointTest(kern_name##_kernel<TestType>, ref, \
ULPValidatorBuilderFactory<TestType>(ulp)); \
}
+142
Ver ficheiro
@@ -0,0 +1,142 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "math_common.hh"
#include "math_special_values.hh"
#include <hip/hip_cooperative_groups.h>
namespace cg = cooperative_groups;
#define MATH_TERNARY_KERNEL_DEF(func_name) \
template <typename T> \
__global__ void func_name##_kernel(T* const ys, const size_t num_xs, T* const x1s, T* const x2s, \
T* const x3s) { \
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(x1s[i], x2s[i], x3s[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
ys[i] = func_name(x1s[i], x2s[i], x3s[i]); \
} \
} \
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void TernaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg, TArg> kernel,
ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder,
const TArg a = std::numeric_limits<TArg>::lowest(),
const TArg b = std::numeric_limits<TArg>::max()) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const uint64_t num_iterations = GetTestIterationCount();
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 3 + sizeof(T)), num_iterations);
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
MathTest math_test(kernel, max_batch_size);
auto batch_size = max_batch_size;
const auto num_threads = thread_pool.thread_count();
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
const auto min_sub_batch_size = batch_size / num_threads;
const auto tail = batch_size % num_threads;
auto base_idx = 0u;
for (auto i = 0u; i < num_threads; ++i) {
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &x1s, &x2s, &x3s] {
const auto generator = [=] {
static thread_local std::mt19937 rng(std::random_device{}());
std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
return static_cast<TArg>(unif_dist(rng));
};
std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
std::generate(x3s.ptr() + base_idx, x3s.ptr() + base_idx + sub_batch_size, generator);
});
base_idx += sub_batch_size;
}
thread_pool.Wait();
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
x2s.ptr(), x3s.ptr());
}
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void TernaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg, TArg> kernel,
ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto values = std::get<SpecialVals<TArg>>(kSpecialValRegistry);
const auto size = values.size * values.size * values.size;
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
for (auto i = 0u; i < values.size; ++i) {
for (auto j = 0u; j < values.size; ++j) {
for (auto k = 0u; k < values.size; ++k) {
x1s.ptr()[(i * values.size + j) * values.size + k] = values.data[i];
x2s.ptr()[(i * values.size + j) * values.size + k] = values.data[j];
x3s.ptr()[(i * values.size + j) * values.size + k] = values.data[k];
}
}
}
MathTest math_test(kernel, size);
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
x2s.ptr(), x3s.ptr());
}
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
void TernaryFloatingPointTest(kernel_sig<T, TArg, TArg, TArg> kernel, ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
SECTION("Special values") {
TernaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
}
SECTION("Brute force") { TernaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder); }
}
#define MATH_TERNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
MATH_TERNARY_KERNEL_DEF(kern_name) \
\
TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) { \
using RT = RefType_t<TestType>; \
RT (*ref)(RT, RT, RT) = ref_func; \
const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp; \
\
TernaryFloatingPointTest(kern_name##_kernel<TestType>, ref, \
ULPValidatorBuilderFactory<TestType>(ulp)); \
}
+64
Ver ficheiro
@@ -0,0 +1,64 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <atomic>
#include <thread>
#include <boost/asio/post.hpp>
#include <boost/asio/thread_pool.hpp>
// This is a simple wrapper around boost::asio::thread_pool that keeps track of the number of
// currently active tasks using an atomic counter.
class ThreadPool {
public:
ThreadPool(size_t thread_count = std::thread::hardware_concurrency())
: thread_count_(thread_count) {}
~ThreadPool() { thread_pool_.join(); }
// Submits a task to the thread pool and increments the number of active tasks. The task is
// wrapped in a lambda that decrements the number of active tasks upon completion.
template <typename T> void Post(T&& task) {
++active_tasks_;
auto&& task_wrapper = [task, this] {
task();
--active_tasks_;
};
boost::asio::post(thread_pool_, task_wrapper);
}
// Busy waits for the number of active tasks to reach zero.
void Wait() const {
while (active_tasks_.load(std::memory_order_relaxed))
;
}
size_t thread_count() const { return thread_count_; }
private:
const size_t thread_count_;
boost::asio::thread_pool thread_pool_{thread_count_};
std::atomic<size_t> active_tasks_;
};
inline ThreadPool thread_pool{};
@@ -0,0 +1,108 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
#define TRIG_DP_UNARY_NEGATIVE_KERNELS(func_name) \
__global__ void func_name##_kernel_v1(double* x) { double result = func_name(x); } \
__global__ void func_name##_kernel_v2(Dummy x) { double result = func_name(x); }
/*Expecting 2 errors per macro invocation - 26 total*/
TRIG_DP_UNARY_NEGATIVE_KERNELS(sin)
TRIG_DP_UNARY_NEGATIVE_KERNELS(cos)
TRIG_DP_UNARY_NEGATIVE_KERNELS(tan)
TRIG_DP_UNARY_NEGATIVE_KERNELS(asin)
TRIG_DP_UNARY_NEGATIVE_KERNELS(acos)
TRIG_DP_UNARY_NEGATIVE_KERNELS(atan)
TRIG_DP_UNARY_NEGATIVE_KERNELS(sinh)
TRIG_DP_UNARY_NEGATIVE_KERNELS(cosh)
TRIG_DP_UNARY_NEGATIVE_KERNELS(tanh)
TRIG_DP_UNARY_NEGATIVE_KERNELS(asinh)
TRIG_DP_UNARY_NEGATIVE_KERNELS(atanh)
TRIG_DP_UNARY_NEGATIVE_KERNELS(sinpi)
TRIG_DP_UNARY_NEGATIVE_KERNELS(cospi)
/*Expecting 4 errors*/
__global__ void atan2_kernel_v1(double* x, double y) { double result = atan2(x, y); }
__global__ void atan2_kernel_v2(double x, double* y) { double result = atan2(x, y); }
__global__ void atan2_kernel_v3(Dummy x, double y) { double result = atan2(x, y); }
__global__ void atan2_kernel_v4(double x, Dummy y) { double result = atan2(x, y); }
/*Expecting 18 errors*/
__global__ void sincos_kernel_v1(double* x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v2(Dummy x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v3(double x, char* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v4(double x, short* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v5(double x, int* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v6(double x, long* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v7(double x, long long* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v8(double x, float* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v9(double x, Dummy* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v10(double x, const double* sptr, double* cptr) {
sincos(x, sptr, cptr);
}
__global__ void sincos_kernel_v11(double x, double* sptr, char* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v12(double x, double* sptr, short* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v13(double x, double* sptr, int* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v14(double x, double* sptr, long* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v15(double x, double* sptr, long long* cptr) {
sincos(x, sptr, cptr);
}
__global__ void sincos_kernel_v16(double x, double* sptr, float* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v17(double x, double* sptr, Dummy* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v18(double x, double* sptr, const double* cptr) {
sincos(x, sptr, cptr);
}
/*Expecting 18 errors*/
__global__ void sincospi_kernel_v1(float* x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v3(float x, char* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v4(float x, short* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v5(float x, int* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v6(float x, long* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v7(float x, long long* sptr, float* cptr) {
sincospi(x, sptr, cptr);
}
__global__ void sincospi_kernel_v8(float x, double* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v10(float x, const float* sptr, float* cptr) {
sincospi(x, sptr, cptr);
}
__global__ void sincospi_kernel_v11(float x, float* sptr, char* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v12(float x, float* sptr, short* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v13(float x, float* sptr, int* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v14(float x, float* sptr, long* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v15(float x, float* sptr, long long* cptr) {
sincospi(x, sptr, cptr);
}
__global__ void sincospi_kernel_v16(float x, float* sptr, double* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v17(float x, float* sptr, Dummy* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v18(float x, float* sptr, const float* cptr) {
sincospi(x, sptr, cptr);
}
+137
Ver ficheiro
@@ -0,0 +1,137 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "trig_negative_kernels_rtc.hh"
#include "unary_common.hh"
#include "binary_common.hh"
#include <boost/math/special_functions.hpp>
MATH_UNARY_WITHIN_ULP_TEST_DEF(sin, std::sin, 2, 2);
TEST_CASE("Unit_Device_sin_sinf_Negative_RTC") { NegativeTestRTCWrapper<4>(kSin); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(cos, std::cos, 2, 2)
TEST_CASE("Unit_Device_cos_cosf_Negative_RTC") { NegativeTestRTCWrapper<4>(kCos); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(tan, std::tan, 4, 2)
TEST_CASE("Unit_Device_tan_tanf_Negative_RTC") { NegativeTestRTCWrapper<4>(kTan); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(asin, std::asin, 2, 2)
TEST_CASE("Unit_Device_asin_asinf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAsin); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(acos, std::acos, 2, 2)
TEST_CASE("Unit_Device_acos_acosf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAcos); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(atan, std::atan, 2, 2)
TEST_CASE("Unit_Device_atan_atanf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAtan); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(sinh, std::sinh, 3, 2)
TEST_CASE("Unit_Device_sinh_sinhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kSinh); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(cosh, std::cosh, 2, 1)
TEST_CASE("Unit_Device_cosh_coshf_Negative_RTC") { NegativeTestRTCWrapper<4>(kCosh); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(tanh, std::tanh, 2, 1)
TEST_CASE("Unit_Device_tanh_tanhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kTanh); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(asinh, std::asinh, 3, 2)
TEST_CASE("Unit_Device_asinh_asinhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAsinh); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(acosh, std::acosh, 4, 2)
TEST_CASE("Unit_Device_acosh_acoshf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAcosh); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(atanh, std::atanh, 3, 2)
TEST_CASE("Unit_Device_atanh_atanhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAtanh); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(sinpi, boost::math::sin_pi, 2, 2);
TEST_CASE("Unit_Device_sinpi_sinpif_Negative_RTC") { NegativeTestRTCWrapper<4>(kSinpi); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(cospi, boost::math::cos_pi, 2, 2);
TEST_CASE("Unit_Device_cospi_cospif_Negative_RTC") { NegativeTestRTCWrapper<4>(kCospi); }
MATH_BINARY_WITHIN_ULP_TEST_DEF(atan2, std::atan2, 3, 2);
TEST_CASE("Unit_Device_atan2_atan2f_Negative_RTC") { NegativeTestRTCWrapper<8>(kAtan2); }
template <typename T>
__global__ void sincos_kernel(std::pair<T, T>* const ys, const size_t num_xs, T* const xs) {
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
sincosf(xs[i], &ys[i].first, &ys[i].second);
} else if constexpr (std::is_same_v<double, T>) {
sincos(xs[i], &ys[i].first, &ys[i].second);
}
}
}
template <typename T> std::pair<T, T> sincos(T x) { return {std::sin(x), std::cos(x)}; }
TEST_CASE("Unit_Device_sincos_Accuracy_Positive - float") {
UnarySinglePrecisionTest(
sincos_kernel<float>, sincos<double>,
PairValidatorBuilderFactory<float>(ULPValidatorBuilderFactory<float>(2)));
}
TEST_CASE("Unit_Device_sincos_Accuracy_Positive - double") {
const auto validator_builder =
PairValidatorBuilderFactory<double>(ULPValidatorBuilderFactory<double>(2));
UnaryDoublePrecisionTest(sincos_kernel<double>, sincos<long double>, validator_builder);
}
TEST_CASE("Unit_Device_sincos_sincosf_Negative_RTC") { NegativeTestRTCWrapper<36>(kSincos); }
template <typename T>
__global__ void sincospi_kernel(std::pair<T, T>* const ys, const size_t num_xs, T* const xs) {
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
sincospif(xs[i], &ys[i].first, &ys[i].second);
} else if constexpr (std::is_same_v<double, T>) {
sincospi(xs[i], &ys[i].first, &ys[i].second);
}
}
}
template <typename T> std::pair<T, T> sincospi(T x) {
return {boost::math::sin_pi(x), boost::math::cos_pi(x)};
}
TEST_CASE("Unit_Device_sincospi_Accuracy_Positive - float") {
UnarySinglePrecisionTest(
sincospi_kernel<float>, sincospi<double>,
PairValidatorBuilderFactory<float>(ULPValidatorBuilderFactory<float>(2)));
}
TEST_CASE("Unit_Device_sincospi_Accuracy_Positive - double") {
const auto validator_builder =
PairValidatorBuilderFactory<double>(ULPValidatorBuilderFactory<double>(2));
UnaryDoublePrecisionTest(sincospi_kernel<double>, sincospi<long double>, validator_builder);
}
TEST_CASE("Unit_Device_sincospi_sincospif_Negative_RTC") { NegativeTestRTCWrapper<36>(kSincospi); }
@@ -0,0 +1,320 @@
// #define TRIG_UNARY_NEGATIVE_KERNELS(func_name)
// class Dummy {
// public:
// __device__ Dummy() {}
// __device__ ~Dummy() {}
// };
// __global__ void func_name##f_kernel_v1(float* x) { float result = func_name##f(x); }
// __global__ void func_name##f_kernel_v2(Dummy x) { float result = func_name##f(x); }
// __global__ void func_name##_kernel_v1(double* x) { double result = func_name(x); }
// __global__ void func_name##_kernel_v2(Dummy x) { double result = func_name(x); }
static constexpr auto kSin{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void sinf_kernel_v1(float* x) { float result = sinf(x); }
__global__ void sinf_kernel_v2(Dummy x) { float result = sinf(x); }
__global__ void sin_kernel_v1(double* x) { double result = sin(x); }
__global__ void sin_kernel_v2(Dummy x) { double result = sin(x); }
)"};
static constexpr auto kCos{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void cosf_kernel_v1(float* x) { float result = cosf(x); }
__global__ void cosf_kernel_v2(Dummy x) { float result = cosf(x); }
__global__ void cos_kernel_v1(double* x) { double result = cos(x); }
__global__ void cos_kernel_v2(Dummy x) { double result = cos(x); }
)"};
static constexpr auto kTan{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void tanf_kernel_v1(float* x) { float result = tanf(x); }
__global__ void tanf_kernel_v2(Dummy x) { float result = tanf(x); }
__global__ void tan_kernel_v1(double* x) { double result = tan(x); }
__global__ void tan_kernel_v2(Dummy x) { double result = tan(x); }
)"};
static constexpr auto kAsin{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void asinf_kernel_v1(float* x) { float result = asinf(x); }
__global__ void asinf_kernel_v2(Dummy x) { float result = asinf(x); }
__global__ void asin_kernel_v1(double* x) { double result = asin(x); }
__global__ void asin_kernel_v2(Dummy x) { double result = asin(x); }
)"};
static constexpr auto kAcos{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void acosf_kernel_v1(float* x) { float result = acosf(x); }
__global__ void acosf_kernel_v2(Dummy x) { float result = acosf(x); }
__global__ void acos_kernel_v1(double* x) { double result = acos(x); }
__global__ void acos_kernel_v2(Dummy x) { double result = acos(x); }
)"};
static constexpr auto kAtan{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atanf_kernel_v1(float* x) { float result = atanf(x); }
__global__ void atanf_kernel_v2(Dummy x) { float result = atanf(x); }
__global__ void atan_kernel_v1(double* x) { double result = atan(x); }
__global__ void atan_kernel_v2(Dummy x) { double result = atan(x); }
)"};
static constexpr auto kSinh{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void sinhf_kernel_v1(float* x) { float result = sinhf(x); }
__global__ void sinhf_kernel_v2(Dummy x) { float result = sinhf(x); }
__global__ void sinh_kernel_v1(double* x) { double result = sinh(x); }
__global__ void sinh_kernel_v2(Dummy x) { double result = sinh(x); }
)"};
static constexpr auto kCosh{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void coshf_kernel_v1(float* x) { float result = coshf(x); }
__global__ void coshf_kernel_v2(Dummy x) { float result = coshf(x); }
__global__ void cosh_kernel_v1(double* x) { double result = cosh(x); }
__global__ void cosh_kernel_v2(Dummy x) { double result = cosh(x); }
)"};
static constexpr auto kTanh{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void tanhf_kernel_v1(float* x) { float result = tanhf(x); }
__global__ void tanhf_kernel_v2(Dummy x) { float result = tanhf(x); }
__global__ void tanh_kernel_v1(double* x) { double result = tanh(x); }
__global__ void tanh_kernel_v2(Dummy x) { double result = tanh(x); }
)"};
static constexpr auto kAsinh{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void asinhf_kernel_v1(float* x) { float result = asinhf(x); }
__global__ void asinhf_kernel_v2(Dummy x) { float result = asinhf(x); }
__global__ void asinh_kernel_v1(double* x) { double result = asinh(x); }
__global__ void asinh_kernel_v2(Dummy x) { double result = asinh(x); }
)"};
static constexpr auto kAcosh{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void acoshf_kernel_v1(float* x) { float result = acoshf(x); }
__global__ void acoshf_kernel_v2(Dummy x) { float result = acoshf(x); }
__global__ void acosh_kernel_v1(double* x) { double result = acosh(x); }
__global__ void acosh_kernel_v2(Dummy x) { double result = acosh(x); }
)"};
static constexpr auto kAtanh{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atanhf_kernel_v1(float* x) { float result = atanhf(x); }
__global__ void atanhf_kernel_v2(Dummy x) { float result = atanhf(x); }
__global__ void atanh_kernel_v1(double* x) { double result = atanh(x); }
__global__ void atanh_kernel_v2(Dummy x) { double result = atanh(x); }
)"};
static constexpr auto kSinpi{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void sinpif_kernel_v1(float* x) { float result = sinpif(x); }
__global__ void sinpif_kernel_v2(Dummy x) { float result = sinpif(x); }
__global__ void sinpi_kernel_v1(double* x) { double result = sinpi(x); }
__global__ void sinpi_kernel_v2(Dummy x) { double result = sinpi(x); }
)"};
static constexpr auto kCospi{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void cospif_kernel_v1(float* x) { float result = cospif(x); }
__global__ void cospif_kernel_v2(Dummy x) { float result = cospif(x); }
__global__ void cospi_kernel_v1(double* x) { double result = cospi(x); }
__global__ void cospi_kernel_v2(Dummy x) { double result = cospi(x); }
)"};
static constexpr auto kAtan2{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atan2f_kernel_v1(float* x, float y) { float result = atan2f(x, y); }
__global__ void atan2f_kernel_v2(float x, float* y) { float result = atan2f(x, y); }
__global__ void atan2f_kernel_v3(Dummy x, float y) { float result = atan2f(x, y); }
__global__ void atan2f_kernel_v4(float x, Dummy y) { float result = atan2f(x, y); }
__global__ void atan2_kernel_v1(double* x, double y) { double result = atan2(x, y); }
__global__ void atan2_kernel_v2(double x, double* y) { double result = atan2(x, y); }
__global__ void atan2_kernel_v3(Dummy x, double y) { double result = atan2(x, y); }
__global__ void atan2_kernel_v4(double x, Dummy y) { double result = atan2(x, y); }
)"};
static constexpr auto kSincos{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void sincosf_kernel_v1(float* x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v2(Dummy x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v3(float x, char* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v4(float x, short* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v5(float x, int* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v6(float x, long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v7(float x, long long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v8(float x, double* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v9(float x, Dummy* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v10(float x, const float* sptr, float* cptr) {
sincosf(x, sptr, cptr);
}
__global__ void sincosf_kernel_v11(float x, float* sptr, char* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v12(float x, float* sptr, short* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v13(float x, float* sptr, int* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v14(float x, float* sptr, long* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v15(float x, float* sptr, long long* cptr) {
sincosf(x, sptr, cptr);
}
__global__ void sincosf_kernel_v16(float x, float* sptr, double* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v17(float x, float* sptr, Dummy* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v18(float x, float* sptr, const float* cptr) {
sincosf(x, sptr, cptr);
}
__global__ void sincos_kernel_v1(double* x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v2(Dummy x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v3(double x, char* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v4(double x, short* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v5(double x, int* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v6(double x, long* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v7(double x, long long* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v8(double x, float* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v9(double x, Dummy* sptr, double* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v10(double x, const double* sptr, double* cptr) {
sincos(x, sptr, cptr);
}
__global__ void sincos_kernel_v11(double x, double* sptr, char* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v12(double x, double* sptr, short* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v13(double x, double* sptr, int* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v14(double x, double* sptr, long* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v15(double x, double* sptr, long long* cptr) {
sincos(x, sptr, cptr);
}
__global__ void sincos_kernel_v16(double x, double* sptr, float* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v17(double x, double* sptr, Dummy* cptr) { sincos(x, sptr, cptr); }
__global__ void sincos_kernel_v18(double x, double* sptr, const double* cptr) {
sincos(x, sptr, cptr);
}
)"};
static constexpr auto kSincospi{R"(
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void sincospif_kernel_v1(float* x, float* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v3(float x, char* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v4(float x, short* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v5(float x, int* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v6(float x, long* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v7(float x, long long* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v8(float x, double* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v10(float x, const float* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v11(float x, float* sptr, char* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v12(float x, float* sptr, short* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v13(float x, float* sptr, int* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v14(float x, float* sptr, long* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v15(float x, float* sptr, long long* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v16(float x, float* sptr, double* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v17(float x, float* sptr, Dummy* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v18(float x, float* sptr, const float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospi_kernel_v1(float* x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v3(float x, char* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v4(float x, short* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v5(float x, int* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v6(float x, long* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v7(float x, long long* sptr, float* cptr) {
sincospi(x, sptr, cptr);
}
__global__ void sincospi_kernel_v8(float x, double* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v10(float x, const float* sptr, float* cptr) {
sincospi(x, sptr, cptr);
}
__global__ void sincospi_kernel_v11(float x, float* sptr, char* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v12(float x, float* sptr, short* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v13(float x, float* sptr, int* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v14(float x, float* sptr, long* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v15(float x, float* sptr, long long* cptr) {
sincospi(x, sptr, cptr);
}
__global__ void sincospi_kernel_v16(float x, float* sptr, double* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v17(float x, float* sptr, Dummy* cptr) { sincospi(x, sptr, cptr); }
__global__ void sincospi_kernel_v18(float x, float* sptr, const float* cptr) {
sincospi(x, sptr, cptr);
}
)"};
@@ -0,0 +1,118 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
class Dummy {
public:
__device__ Dummy() {}
__device__ ~Dummy() {}
};
#define TRIG_SP_UNARY_NEGATIVE_KERNELS(func_name) \
__global__ void func_name##f_kernel_v1(float* x) { float result = func_name##f(x); } \
__global__ void func_name##f_kernel_v2(Dummy x) { float result = func_name##f(x); }
/*Expecting 2 errors per macro invocation - 26 total*/
TRIG_SP_UNARY_NEGATIVE_KERNELS(sin)
TRIG_SP_UNARY_NEGATIVE_KERNELS(cos)
TRIG_SP_UNARY_NEGATIVE_KERNELS(tan)
TRIG_SP_UNARY_NEGATIVE_KERNELS(asin)
TRIG_SP_UNARY_NEGATIVE_KERNELS(acos)
TRIG_SP_UNARY_NEGATIVE_KERNELS(atan)
TRIG_SP_UNARY_NEGATIVE_KERNELS(sinh)
TRIG_SP_UNARY_NEGATIVE_KERNELS(cosh)
TRIG_SP_UNARY_NEGATIVE_KERNELS(tanh)
TRIG_SP_UNARY_NEGATIVE_KERNELS(asinh)
TRIG_SP_UNARY_NEGATIVE_KERNELS(atanh)
TRIG_SP_UNARY_NEGATIVE_KERNELS(sinpi)
TRIG_SP_UNARY_NEGATIVE_KERNELS(cospi)
/*Expecting 4 errors*/
__global__ void atan2f_kernel_v1(float* x, float y) { float result = atan2f(x, y); }
__global__ void atan2f_kernel_v2(float x, float* y) { float result = atan2f(x, y); }
__global__ void atan2f_kernel_v3(Dummy x, float y) { float result = atan2f(x, y); }
__global__ void atan2f_kernel_v4(float x, Dummy y) { float result = atan2f(x, y); }
/*Expecting 18 errors*/
__global__ void sincosf_kernel_v1(float* x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v2(Dummy x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v3(float x, char* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v4(float x, short* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v5(float x, int* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v6(float x, long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v7(float x, long long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v8(float x, double* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v9(float x, Dummy* sptr, float* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v10(float x, const float* sptr, float* cptr) {
sincosf(x, sptr, cptr);
}
__global__ void sincosf_kernel_v11(float x, float* sptr, char* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v12(float x, float* sptr, short* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v13(float x, float* sptr, int* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v14(float x, float* sptr, long* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v15(float x, float* sptr, long long* cptr) {
sincosf(x, sptr, cptr);
}
__global__ void sincosf_kernel_v16(float x, float* sptr, double* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v17(float x, float* sptr, Dummy* cptr) { sincosf(x, sptr, cptr); }
__global__ void sincosf_kernel_v18(float x, float* sptr, const float* cptr) {
sincosf(x, sptr, cptr);
}
/*Expecting 18 errors*/
__global__ void sincospif_kernel_v1(float* x, float* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v3(float x, char* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v4(float x, short* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v5(float x, int* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v6(float x, long* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v7(float x, long long* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v8(float x, double* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v10(float x, const float* sptr, float* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v11(float x, float* sptr, char* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v12(float x, float* sptr, short* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v13(float x, float* sptr, int* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v14(float x, float* sptr, long* cptr) { sincospif(x, sptr, cptr); }
__global__ void sincospif_kernel_v15(float x, float* sptr, long long* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v16(float x, float* sptr, double* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v17(float x, float* sptr, Dummy* cptr) {
sincospif(x, sptr, cptr);
}
__global__ void sincospif_kernel_v18(float x, float* sptr, const float* cptr) {
sincospif(x, sptr, cptr);
}
+198
Ver ficheiro
@@ -0,0 +1,198 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "math_common.hh"
#include "math_special_values.hh"
#include <hip/hip_cooperative_groups.h>
namespace cg = cooperative_groups;
#define MATH_UNARY_KERNEL_DEF(func_name) \
template <typename T, typename RT = T> \
__global__ void func_name##_kernel(RT* const ys, const size_t num_xs, T* const xs) { \
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(xs[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
ys[i] = func_name(xs[i]); \
} \
} \
}
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnarySinglePrecisionBruteForceTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
uint64_t stop = std::numeric_limits<uint32_t>::max() + 1ul;
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T)), stop);
LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
MathTest math_test(kernel, max_batch_size);
auto batch_size = max_batch_size;
const auto num_threads = thread_pool.thread_count();
for (uint64_t v = 0u; v < stop;) {
batch_size = std::min<uint64_t>(max_batch_size, stop - v);
const auto min_sub_batch_size = batch_size / num_threads;
const auto tail = batch_size % num_threads;
auto base_idx = 0u;
for (auto i = 0u; i < num_threads; ++i) {
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &values] {
auto t = v;
uint32_t val;
for (auto j = 0u; j < sub_batch_size; ++j) {
val = static_cast<uint32_t>(t++);
values.ptr()[base_idx + j] = *reinterpret_cast<float*>(&val);
}
});
v += sub_batch_size;
base_idx += sub_batch_size;
}
thread_pool.Wait();
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, values.ptr());
}
}
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnarySinglePrecisionRangeTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref_func,
const ValidatorBuilder& validator_builder, const float a,
const float b) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
uint64_t stop = std::numeric_limits<uint32_t>::max() + 1ul;
const auto max_batch_size = GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T));
LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
MathTest math_test(kernel, max_batch_size);
uint32_t val = 0u;
const auto num_threads = thread_pool.thread_count();
size_t inserted = 0u;
for (float v = a; v != b; v = std::nextafter(v, b)) {
values.ptr()[inserted++] = v;
if (inserted < max_batch_size) continue;
math_test.Run(validator_builder, grid_size, block_size, ref_func, inserted, values.ptr());
inserted = 0u;
}
}
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnaryDoublePrecisionBruteForceTest(kernel_sig<T, double> kernel, ref_sig<RT, RTArg> ref_func,
const ValidatorBuilder& validator_builder,
const double a = std::numeric_limits<double>::lowest(),
const double b = std::numeric_limits<double>::max()) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const uint64_t num_iterations = GetTestIterationCount();
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(double) + sizeof(T)), num_iterations);
LinearAllocGuard<double> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(double)};
MathTest math_test(kernel, max_batch_size);
auto batch_size = max_batch_size;
const auto num_threads = thread_pool.thread_count();
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
const auto min_sub_batch_size = batch_size / num_threads;
const auto tail = batch_size % num_threads;
auto base_idx = 0u;
for (auto i = 0u; i < num_threads; ++i) {
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &values] {
const auto generator = [=] {
static thread_local std::mt19937 rng(std::random_device{}());
std::uniform_real_distribution<long double> unif_dist(a, b);
return static_cast<double>(unif_dist(rng));
};
std::generate(values.ptr() + base_idx, values.ptr() + base_idx + sub_batch_size, generator);
});
base_idx += sub_batch_size;
}
thread_pool.Wait();
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, values.ptr());
}
}
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnaryDoublePrecisionSpecialValuesTest(kernel_sig<T, double> kernel,
ref_sig<RT, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto values = std::get<SpecialVals<double>>(kSpecialValRegistry);
MathTest math_test(kernel, values.size);
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, values.size,
values.data);
}
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnarySinglePrecisionTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref,
const ValidatorBuilder& validator_builder) {
SECTION("Brute force") { UnarySinglePrecisionBruteForceTest(kernel, ref, validator_builder); }
}
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnaryDoublePrecisionTest(kernel_sig<T, double> kernel, ref_sig<RT, RTArg> ref,
const ValidatorBuilder& validator_builder) {
SECTION("Special values") {
UnaryDoublePrecisionSpecialValuesTest(kernel, ref, validator_builder);
}
SECTION("Brute force") { UnaryDoublePrecisionBruteForceTest(kernel, ref, validator_builder); }
}
#define MATH_UNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
MATH_UNARY_KERNEL_DEF(kern_name) \
\
TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive - float") { \
double (*ref)(double) = ref_func; \
UnarySinglePrecisionTest(kern_name##_kernel<float>, ref, \
ULPValidatorBuilderFactory<float>(sp_ulp)); \
} \
\
TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive - double") { \
long double (*ref)(long double) = ref_func; \
UnaryDoublePrecisionTest(kern_name##_kernel<double>, ref, \
ULPValidatorBuilderFactory<double>(dp_ulp)); \
}
#define MATH_UNARY_WITHIN_ULP_STL_REF_TEST_DEF(func_name, sp_ulp, dp_ulp) \
MATH_UNARY_WITHIN_ULP_TEST_DEF(func_name, std::func_name, sp_ulp, dp_ulp)
+152
Ver ficheiro
@@ -0,0 +1,152 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <catch.hpp>
// Define a new MatcherBase class with a public 'describe' member function because
// Catch::MatcherBase::describe is protected and thus can't be used via a pointer to
// Catch::MatcherBase.
template <typename T> class MatcherBase : public Catch::MatcherBase<T> {
public:
virtual std::string describe() const = 0;
virtual ~MatcherBase() = default;
};
template <typename T, typename Matcher> class ValidatorBase : public MatcherBase<T> {
public:
template <typename... Ts>
ValidatorBase(T target, Ts&&... args) : matcher_{std::forward<Ts>(args)...}, target_{target} {}
bool match(const T& val) const override {
if (std::isnan(target_)) {
return std::isnan(val);
}
return matcher_.match(val);
}
std::string describe() const override {
if (std::isnan(target_)) {
return "is not NaN";
}
return matcher_.describe();
}
private:
Matcher matcher_;
T target_;
bool nan = false;
};
template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
return [=](T target, auto&&... args) {
return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinUlpsMatcher>>(
target, Catch::WithinULP(target, ulps));
};
};
template <typename T> auto AbsValidatorBuilderFactory(double margin) {
return [=](T target, auto&&... args) {
return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinAbsMatcher>>(
target, Catch::WithinAbs(target, margin));
};
}
template <typename T> auto RelValidatorBuilderFactory(T margin) {
return [=](T target, auto&&... args) {
return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinRelMatcher>>(
target, Catch::WithinRel(target, margin));
};
}
template <typename T> class EqValidator : public MatcherBase<T> {
public:
EqValidator(T target) : target_{target} {}
bool match(const T& val) const override {
if (std::isnan(target_)) {
return std::isnan(val);
}
return target_ == val;
}
std::string describe() const override {
std::stringstream ss;
ss << " is not equal to " << target_;
return ss.str();
}
private:
T target_;
};
template <typename T> auto EqValidatorBuilderFactory() {
return [](T val, auto&&... args) { return std::make_unique<EqValidator<T>>(val); };
}
template <typename T, typename U, typename VBF, typename VBS>
class PairValidator : public MatcherBase<std::pair<T, U>> {
public:
PairValidator(const std::pair<T, U>& target, const VBF& vbf, const VBS& vbs)
: first_matcher_{vbf(target.first)}, second_matcher_{vbs(target.second)} {}
bool match(const std::pair<T, U>& val) const override {
return first_matcher_->match(val.first) && second_matcher_->match(val.second);
}
std::string describe() const override {
return "<" + first_matcher_->describe() + ", " + second_matcher_->describe() + ">";
}
private:
decltype(std::declval<VBF>()(std::declval<T>())) first_matcher_;
decltype(std::declval<VBS>()(std::declval<U>())) second_matcher_;
};
template <typename T, typename ValidatorBuilder>
auto PairValidatorBuilderFactory(const ValidatorBuilder& vb) {
return [=](const std::pair<T, T>& t, auto&&... args) {
return std::make_unique<PairValidator<T, T, ValidatorBuilder, ValidatorBuilder>>(t, vb, vb);
};
}
template <typename T, typename U, typename VBF, typename VBS>
auto PairValidatorBuilderFactory(const VBF& vbf, const VBS& vbs) {
return [=](const std::pair<T, U>& t, auto&&... args) {
return std::make_unique<PairValidator<T, U, VBF, VBS>>(t, vbf, vbs);
};
}
template <typename T> class NopValidator : public MatcherBase<T> {
public:
bool match(const T& val) const override { return true; }
std::string describe() const override { return ""; }
};
template <typename T> auto NopValidatorBuilderFactory() {
return [](auto&&... args) { return std::make_unique<NopValidator<T>>(); };
}