EXSWHTEC-287 - Implement tests for trigonometric device math functions #231
Change-Id: I44188fa2f60f2492e05596464c914b8c739be482
[ROCm/hip-tests commit: 36620358e6]
Este cometimento está contido em:
cometido por
Rakesh Roy
ascendente
db33e97368
cometimento
e06f1302bf
@@ -19,8 +19,19 @@
|
||||
# THE SOFTWARE.
|
||||
|
||||
set(TEST_SRC
|
||||
trig_funcs.cc
|
||||
)
|
||||
|
||||
hip_add_exe_to_target(NAME MathsTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
|
||||
|
||||
add_test(NAME Unit_Device_Single_Precision_Trig_Functions_Negative
|
||||
COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
|
||||
${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
|
||||
trig_single_precision_negative_kernels.cc 66)
|
||||
|
||||
add_test(NAME Unit_Device_Double_Precision_Trig_Functions_Negative
|
||||
COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
|
||||
${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
|
||||
trig_double_precision_negative_kernels.cc 66)
|
||||
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "math_common.hh"
|
||||
#include "math_special_values.hh"
|
||||
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
#define MATH_BINARY_KERNEL_DEF(func_name) \
|
||||
template <typename T, typename RT = T> \
|
||||
__global__ void func_name##_kernel(RT* const ys, const size_t num_xs, T* const x1s, \
|
||||
T* const x2s) { \
|
||||
const auto tid = cg::this_grid().thread_rank(); \
|
||||
const auto stride = cg::this_grid().size(); \
|
||||
\
|
||||
for (auto i = tid; i < num_xs; i += stride) { \
|
||||
if constexpr (std::is_same_v<float, T>) { \
|
||||
ys[i] = func_name##f(x1s[i], x2s[i]); \
|
||||
} else if constexpr (std::is_same_v<double, T>) { \
|
||||
ys[i] = func_name(x1s[i], x2s[i]); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void BinaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg> kernel,
|
||||
ref_sig<RT, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder,
|
||||
const TArg a = std::numeric_limits<TArg>::lowest(),
|
||||
const TArg b = std::numeric_limits<TArg>::max()) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const uint64_t num_iterations = GetTestIterationCount();
|
||||
const auto max_batch_size =
|
||||
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 2 + sizeof(T)), num_iterations);
|
||||
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
|
||||
MathTest math_test(kernel, max_batch_size);
|
||||
|
||||
auto batch_size = max_batch_size;
|
||||
const auto num_threads = thread_pool.thread_count();
|
||||
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
|
||||
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
|
||||
|
||||
const auto min_sub_batch_size = batch_size / num_threads;
|
||||
const auto tail = batch_size % num_threads;
|
||||
|
||||
auto base_idx = 0u;
|
||||
for (auto i = 0u; i < num_threads; ++i) {
|
||||
const auto sub_batch_size = min_sub_batch_size + (i < tail);
|
||||
thread_pool.Post([=, &x1s, &x2s] {
|
||||
const auto generator = [=] {
|
||||
static thread_local std::mt19937 rng(std::random_device{}());
|
||||
std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
|
||||
return static_cast<TArg>(unif_dist(rng));
|
||||
};
|
||||
std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
|
||||
std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
|
||||
});
|
||||
base_idx += sub_batch_size;
|
||||
}
|
||||
|
||||
thread_pool.Wait();
|
||||
|
||||
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
|
||||
x2s.ptr());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void BinaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg> kernel,
|
||||
ref_sig<RT, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const auto values = std::get<SpecialVals<TArg>>(kSpecialValRegistry);
|
||||
|
||||
const auto size = values.size * values.size;
|
||||
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
|
||||
for (auto i = 0u; i < values.size; ++i) {
|
||||
for (auto j = 0u; j < values.size; ++j) {
|
||||
x1s.ptr()[i * values.size + j] = values.data[i];
|
||||
x2s.ptr()[i * values.size + j] = values.data[j];
|
||||
}
|
||||
}
|
||||
|
||||
MathTest math_test(kernel, size);
|
||||
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
|
||||
x2s.ptr());
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void BinaryFloatingPointTest(kernel_sig<T, TArg, TArg> kernel, ref_sig<RT, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
SECTION("Special values") {
|
||||
BinaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
|
||||
}
|
||||
|
||||
SECTION("Brute force") { BinaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder); }
|
||||
}
|
||||
|
||||
|
||||
#define MATH_BINARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
|
||||
MATH_BINARY_KERNEL_DEF(kern_name) \
|
||||
\
|
||||
TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) { \
|
||||
using RT = RefType_t<TestType>; \
|
||||
RT (*ref)(RT, RT) = ref_func; \
|
||||
const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp; \
|
||||
\
|
||||
BinaryFloatingPointTest(kern_name##_kernel<TestType>, ref, \
|
||||
ULPValidatorBuilderFactory<TestType>(ulp)); \
|
||||
}
|
||||
@@ -0,0 +1,240 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cmd_options.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <resource_guards.hh>
|
||||
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
#include "thread_pool.hh"
|
||||
#include "validators.hh"
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename T, typename U>
|
||||
std::enable_if_t<std::conjunction_v<std::is_arithmetic<T>, std::is_arithmetic<U>>, std::ostream&>
|
||||
operator<<(std::ostream& os, const std::pair<T, U>& p) {
|
||||
const auto default_prec = os.precision();
|
||||
return os << "<" << std::setprecision(std::numeric_limits<T>::max_digits10 - 1) << p.first << ", "
|
||||
<< std::setprecision(std::numeric_limits<U>::max_digits10 - 1) << p.second << ">"
|
||||
<< std::setprecision(default_prec);
|
||||
}
|
||||
|
||||
// This class represents a generic numerical accuracy math test. Template parameter T is the output
|
||||
// type of the function being tested, and template parameter pack Ts represents the input types. The
|
||||
// constructor takes a kernel with the signature void(T*, const size_t, Ts*...). The first kernel
|
||||
// parameter is the output array, the second parameter is the number of outputs, and the rest of the
|
||||
// parameters are arrays containing input values. The number of input arrays depends on the arity of
|
||||
// the function being tested e.g. one input array for unary functions, two input arrays for binary
|
||||
// functions, etc. The kernel threads take one element from each input array at the index
|
||||
// corresponding to that thread, feed the input elements to the testee function, and store the
|
||||
// result in the output array at the corresponding index.
|
||||
//
|
||||
// E.g. for a binary function the kernel would have the following signature:
|
||||
// void kernel(float* y, const size_t n, float* x1, float* x2)
|
||||
//
|
||||
// The outputs would be calculated in parallel the following way:
|
||||
// y[0] = testee(x1[0], x2[0])
|
||||
// y[1] = testee(x1[1], x2[1])
|
||||
// y[2] = testee(x1[2], x2[2])
|
||||
// ...
|
||||
//
|
||||
// The constructor also takes max_num_args, which represents the maximum number of input values used
|
||||
// for one kernel launch. The device memory for the input and output arrays is allocated based on
|
||||
// that number.
|
||||
template <typename T, typename... Ts> class MathTest {
|
||||
public:
|
||||
MathTest(void (*kernel)(T*, const size_t, Ts*...), const size_t max_num_args)
|
||||
: kernel_{kernel},
|
||||
xss_dev_(LinearAllocGuard<Ts>(LinearAllocs::hipMalloc, max_num_args * sizeof(Ts))...),
|
||||
y_dev_{LinearAllocs::hipMalloc, max_num_args * sizeof(T)},
|
||||
y_{LinearAllocs::hipHostMalloc, max_num_args * sizeof(T)} {}
|
||||
|
||||
// This method runs the test with the following steps:
|
||||
// 1. Copy the values from the input arrays provided in the parameter pack xss to device memory
|
||||
// 2. Launch the kernel using the configuration provided in grid_dims and block_dims
|
||||
// 3. Copy the outputs back to host memory
|
||||
// 4. Generate the reference values using ref_func and compare against the outputs using the
|
||||
// validator provided by validator_builder
|
||||
// 5. If non-type template parameter parallel is true, then step 4 is broken up into chunks of
|
||||
// work that are done in parallel on the host.
|
||||
template <bool parallel = true, typename RT, typename ValidatorBuilder, typename... RTs>
|
||||
void Run(const ValidatorBuilder& validator_builder, const size_t grid_dims,
|
||||
const size_t block_dims, RT (*const ref_func)(RTs...), const size_t num_args,
|
||||
const Ts*... xss) {
|
||||
fail_flag_.store(false);
|
||||
error_info_.clear();
|
||||
RunImpl<parallel>(validator_builder, grid_dims, block_dims, ref_func, num_args,
|
||||
std::index_sequence_for<Ts...>{}, xss...);
|
||||
}
|
||||
|
||||
private:
|
||||
void (*kernel_)(T*, const size_t, Ts*...);
|
||||
std::tuple<LinearAllocGuard<Ts>...> xss_dev_;
|
||||
LinearAllocGuard<T> y_dev_;
|
||||
LinearAllocGuard<T> y_;
|
||||
std::atomic<bool> fail_flag_{false};
|
||||
std::mutex mtx_;
|
||||
std::string error_info_;
|
||||
|
||||
template <bool parallel, typename RT, typename ValidatorBuilder, typename... RTs, size_t... I>
|
||||
void RunImpl(const ValidatorBuilder& validator_builder, const size_t grid_dim,
|
||||
const size_t block_dim, RT (*const ref_func)(RTs...), const size_t num_args,
|
||||
std::index_sequence<I...> is, const Ts*... xss) {
|
||||
const auto xss_tup = std::make_tuple(xss...);
|
||||
|
||||
constexpr auto f = [](auto dst, auto src, size_t size) {
|
||||
HIP_CHECK(hipMemcpy(dst, src, size, hipMemcpyHostToDevice))
|
||||
};
|
||||
|
||||
((f(std::get<I>(xss_dev_).ptr(), std::get<I>(xss_tup),
|
||||
num_args * sizeof(*std::get<I>(xss_tup)))),
|
||||
...);
|
||||
|
||||
kernel_<<<grid_dim, block_dim>>>(y_dev_.ptr(), num_args, std::get<I>(xss_dev_).ptr()...);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
HIP_CHECK(hipMemcpy(y_.ptr(), y_dev_.ptr(), num_args * sizeof(T), hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipStreamSynchronize(nullptr));
|
||||
|
||||
if constexpr (!parallel) {
|
||||
for (auto i = 0u; i < num_args; ++i) {
|
||||
const auto actual_val = y_.ptr()[i];
|
||||
const auto ref_val = static_cast<T>(ref_func(xss[i]...));
|
||||
const auto validator = validator_builder(ref_val, xss[i]...);
|
||||
|
||||
if (!validator->match(actual_val)) {
|
||||
const auto log = MakeLogMessage(actual_val, xss[i]...) + validator->describe() + "\n";
|
||||
INFO(log);
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const auto task = [&, this](size_t iters, size_t base_idx) {
|
||||
for (auto i = 0u; i < iters; ++i) {
|
||||
if (fail_flag_.load(std::memory_order_relaxed)) return;
|
||||
|
||||
const auto actual_val = y_.ptr()[base_idx + i];
|
||||
const auto ref_val = static_cast<T>(ref_func(xss[base_idx + i]...));
|
||||
const auto validator = validator_builder(ref_val, xss[base_idx + i]...);
|
||||
|
||||
if (!validator->match(actual_val)) {
|
||||
fail_flag_.store(true, std::memory_order_relaxed);
|
||||
// Several threads might have passed the first check, but failed validation. On the
|
||||
// chance of this happening, access to the string stream must be serialized.
|
||||
const auto log =
|
||||
MakeLogMessage(actual_val, xss[base_idx + i]...) + validator->describe() + "\n";
|
||||
{
|
||||
std::lock_guard lg{mtx_};
|
||||
error_info_ += log;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const auto task_count = thread_pool.thread_count();
|
||||
const auto chunk_size = num_args / task_count;
|
||||
const auto tail = num_args % task_count;
|
||||
|
||||
auto base_idx = 0u;
|
||||
for (auto i = 0u; i < task_count; ++i) {
|
||||
const auto iters = chunk_size + (i < tail);
|
||||
thread_pool.Post([=, &task] { task(iters, base_idx); });
|
||||
base_idx += iters;
|
||||
}
|
||||
|
||||
thread_pool.Wait();
|
||||
|
||||
INFO(error_info_);
|
||||
REQUIRE(!fail_flag_);
|
||||
}
|
||||
|
||||
template <typename... Args> std::string MakeLogMessage(T actual_val, Args... args) {
|
||||
std::stringstream ss;
|
||||
ss << "Input value(s): " << std::scientific
|
||||
<< std::setprecision(std::numeric_limits<T>::max_digits10 - 1);
|
||||
((ss << " " << args), ...) << "\n" << actual_val << " ";
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct RefType {};
|
||||
|
||||
template <> struct RefType<float> { using type = double; };
|
||||
|
||||
template <> struct RefType<double> { using type = long double; };
|
||||
|
||||
template <typename T> using RefType_t = typename RefType<T>::type;
|
||||
|
||||
template <typename F> auto GetOccupancyMaxPotentialBlockSize(F kernel) {
|
||||
int grid_size = 0, block_size = 0;
|
||||
HIP_CHECK(hipOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0, 0));
|
||||
return std::make_tuple(grid_size, block_size);
|
||||
}
|
||||
|
||||
inline size_t GetMaxAllowedDeviceMemoryUsage() {
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
||||
return props.totalGlobalMem * (cmd_options.accuracy_max_memory * 0.01f);
|
||||
}
|
||||
|
||||
inline uint64_t GetTestIterationCount() { return cmd_options.accuracy_iterations; }
|
||||
|
||||
template <typename T, typename... Ts> using kernel_sig = void (*)(T*, const size_t, Ts*...);
|
||||
|
||||
template <typename T, typename... Ts> using ref_sig = T (*)(Ts...);
|
||||
|
||||
template <int error_num> void NegativeTestRTCWrapper(const char* program_source) {
|
||||
hiprtcProgram program{};
|
||||
|
||||
HIPRTC_CHECK(
|
||||
hiprtcCreateProgram(&program, program_source, "math_test_rtc.cc", 0, nullptr, nullptr));
|
||||
hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
|
||||
|
||||
// Get the compile log and count compiler error messages
|
||||
size_t log_size{};
|
||||
HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
|
||||
std::string log(log_size, ' ');
|
||||
HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
|
||||
int error_count{0};
|
||||
|
||||
int expected_error_count{error_num};
|
||||
std::string error_message{"error:"};
|
||||
|
||||
size_t n_pos = log.find(error_message, 0);
|
||||
while (n_pos != std::string::npos) {
|
||||
++error_count;
|
||||
n_pos = log.find(error_message, n_pos + 1);
|
||||
}
|
||||
|
||||
HIPRTC_CHECK(hiprtcDestroyProgram(&program));
|
||||
HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
|
||||
REQUIRE(error_count == expected_error_count);
|
||||
}
|
||||
@@ -0,0 +1,287 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
// Disclaimer:
|
||||
// This code is based on the work found in OpenCL-CTS authored by The Khronos Group.
|
||||
// The original code can be found at https://github.com/KhronosGroup/OpenCL-CTS.
|
||||
// We acknowledge the contributions of The Khronos Group to the development of this code.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <limits>
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
HEX_FLT, HEXT_DBL, HEX_LDBL -- Create hex floating point literal of type
|
||||
float, double, long double respectively. Arguments:
|
||||
|
||||
sm -- sign of number,
|
||||
int -- integer part of mantissa (without `0x' prefix),
|
||||
fract -- fractional part of mantissa (without decimal point and `L' or
|
||||
`LL' suffixes),
|
||||
se -- sign of exponent,
|
||||
exp -- absolute value of (binary) exponent.
|
||||
|
||||
Example:
|
||||
|
||||
double yhi = HEX_DBL(+, 1, 5555555555555, -, 2); // 0x1.5555555555555p-2
|
||||
|
||||
Note:
|
||||
|
||||
We have to pass signs as separate arguments because gcc pass negative
|
||||
integer values (e. g. `-2') into a macro as two separate tokens, so
|
||||
`HEX_FLT(1, 0, -2)' produces result `0x1.0p- 2' (note a space between minus
|
||||
and two) which is not a correct floating point literal.
|
||||
-----------------------------------------------------------------------------*/
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
// If compiler does not support hex floating point literals:
|
||||
#define HEX_FLT(sm, int, fract, se, exp) \
|
||||
sm ldexpf((float)(0x##int##fract##UL), \
|
||||
se exp + ilogbf((float)0x##int) - ilogbf((float)(0x##int##fract##UL)))
|
||||
#define HEX_DBL(sm, int, fract, se, exp) \
|
||||
sm ldexp((double)(0x##int##fract##ULL), \
|
||||
se exp + ilogb((double)0x##int) - ilogb((double)(0x##int##fract##ULL)))
|
||||
#define HEX_LDBL(sm, int, fract, se, exp) \
|
||||
sm ldexpl((long double)(0x##int##fract##ULL), \
|
||||
se exp + ilogbl((long double)0x##int) - ilogbl((long double)(0x##int##fract##ULL)))
|
||||
#else
|
||||
// If compiler supports hex floating point literals: just concatenate all the
|
||||
// parts into a literal.
|
||||
#define HEX_FLT(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp##F
|
||||
#define HEX_DBL(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp
|
||||
#define HEX_LDBL(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp##L
|
||||
#endif
|
||||
|
||||
inline constexpr std::array kSpecialValuesDouble{
|
||||
-std::numeric_limits<double>::quiet_NaN(),
|
||||
-std::numeric_limits<double>::infinity(),
|
||||
-std::numeric_limits<double>::max(),
|
||||
HEX_DBL(-, 1, 0000000000001, +, 64),
|
||||
HEX_DBL(-, 1, 0, +, 64),
|
||||
HEX_DBL(-, 1, fffffffffffff, +, 63),
|
||||
HEX_DBL(-, 1, 0000000000001, +, 63),
|
||||
HEX_DBL(-, 1, 0, +, 63),
|
||||
HEX_DBL(-, 1, fffffffffffff, +, 62),
|
||||
HEX_DBL(-, 1, 000002, +, 32),
|
||||
HEX_DBL(-, 1, 0, +, 32),
|
||||
HEX_DBL(-, 1, fffffffffffff, +, 31),
|
||||
HEX_DBL(-, 1, 0000000000001, +, 31),
|
||||
HEX_DBL(-, 1, 0, +, 31),
|
||||
HEX_DBL(-, 1, fffffffffffff, +, 30),
|
||||
-1000.0,
|
||||
-100.0,
|
||||
-4.0,
|
||||
-3.5,
|
||||
-3.0,
|
||||
HEX_DBL(-, 1, 8000000000001, +, 1),
|
||||
-2.5,
|
||||
HEX_DBL(-, 1, 7ffffffffffff, +, 1),
|
||||
-2.0,
|
||||
HEX_DBL(-, 1, 8000000000001, +, 0),
|
||||
-1.5,
|
||||
HEX_DBL(-, 1, 7ffffffffffff, +, 0),
|
||||
HEX_DBL(-, 1, 0000000000001, +, 0),
|
||||
-1.0,
|
||||
HEX_DBL(-, 1, fffffffffffff, -, 1),
|
||||
HEX_DBL(-, 1, 0000000000001, -, 1),
|
||||
-0.5,
|
||||
HEX_DBL(-, 1, fffffffffffff, -, 2),
|
||||
HEX_DBL(-, 1, 0000000000001, -, 2),
|
||||
-0.25,
|
||||
HEX_DBL(-, 1, fffffffffffff, -, 3),
|
||||
HEX_DBL(-, 1, 0000000000001, -, 1022),
|
||||
-std::numeric_limits<double>::min(),
|
||||
HEX_DBL(-, 0, fffffffffffff, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000fff, -, 1022),
|
||||
HEX_DBL(-, 0, 00000000000fe, -, 1022),
|
||||
HEX_DBL(-, 0, 000000000000e, -, 1022),
|
||||
HEX_DBL(-, 0, 000000000000c, -, 1022),
|
||||
HEX_DBL(-, 0, 000000000000a, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000008, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000007, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000006, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000005, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000004, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000003, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000002, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000001, -, 1022),
|
||||
-0.0,
|
||||
|
||||
std::numeric_limits<double>::quiet_NaN(),
|
||||
std::numeric_limits<double>::infinity(),
|
||||
std::numeric_limits<double>::max(),
|
||||
HEX_DBL(+, 1, 0000000000001, +, 64),
|
||||
HEX_DBL(+, 1, 0, +, 64),
|
||||
HEX_DBL(+, 1, fffffffffffff, +, 63),
|
||||
HEX_DBL(+, 1, 0000000000001, +, 63),
|
||||
HEX_DBL(+, 1, 0, +, 63),
|
||||
HEX_DBL(+, 1, fffffffffffff, +, 62),
|
||||
HEX_DBL(+, 1, 000002, +, 32),
|
||||
HEX_DBL(+, 1, 0, +, 32),
|
||||
HEX_DBL(+, 1, fffffffffffff, +, 31),
|
||||
HEX_DBL(+, 1, 0000000000001, +, 31),
|
||||
HEX_DBL(+, 1, 0, +, 31),
|
||||
HEX_DBL(+, 1, fffffffffffff, +, 30),
|
||||
+1000.0,
|
||||
+100.0,
|
||||
+4.0,
|
||||
+3.5,
|
||||
+3.0,
|
||||
HEX_DBL(+, 1, 8000000000001, +, 1),
|
||||
+2.5,
|
||||
HEX_DBL(+, 1, 7ffffffffffff, +, 1),
|
||||
+2.0,
|
||||
HEX_DBL(+, 1, 8000000000001, +, 0),
|
||||
+1.5,
|
||||
HEX_DBL(+, 1, 7ffffffffffff, +, 0),
|
||||
HEX_DBL(+, 1, 0000000000001, +, 0),
|
||||
+1.0,
|
||||
HEX_DBL(+, 1, fffffffffffff, -, 1),
|
||||
HEX_DBL(+, 1, 0000000000001, -, 1),
|
||||
+0.5,
|
||||
HEX_DBL(+, 1, fffffffffffff, -, 2),
|
||||
HEX_DBL(+, 1, 0000000000001, -, 2),
|
||||
+0.25,
|
||||
HEX_DBL(+, 1, fffffffffffff, -, 3),
|
||||
HEX_DBL(+, 1, 0000000000001, -, 1022),
|
||||
+std::numeric_limits<double>::min(),
|
||||
HEX_DBL(+, 0, fffffffffffff, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000fff, -, 1022),
|
||||
HEX_DBL(+, 0, 00000000000fe, -, 1022),
|
||||
HEX_DBL(+, 0, 000000000000e, -, 1022),
|
||||
HEX_DBL(+, 0, 000000000000c, -, 1022),
|
||||
HEX_DBL(+, 0, 000000000000a, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000008, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000007, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000006, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000005, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000004, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000003, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000002, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000001, -, 1022),
|
||||
+0.0,
|
||||
};
|
||||
|
||||
inline constexpr std::array kSpecialValuesFloat{
|
||||
-std::numeric_limits<float>::quiet_NaN(),
|
||||
-std::numeric_limits<float>::infinity(),
|
||||
-std::numeric_limits<float>::max(),
|
||||
HEX_FLT(-, 1, 000002, +, 64),
|
||||
HEX_FLT(-, 1, 0, +, 64),
|
||||
HEX_FLT(-, 1, fffffe, +, 63),
|
||||
HEX_FLT(-, 1, 000002, +, 63),
|
||||
HEX_FLT(-, 1, 0, +, 63),
|
||||
HEX_FLT(-, 1, fffffe, +, 62),
|
||||
HEX_FLT(-, 1, 000002, +, 32),
|
||||
HEX_FLT(-, 1, 0, +, 32),
|
||||
HEX_FLT(-, 1, fffffe, +, 31),
|
||||
HEX_FLT(-, 1, 000002, +, 31),
|
||||
HEX_FLT(-, 1, 0, +, 31),
|
||||
HEX_FLT(-, 1, fffffe, +, 30),
|
||||
-1000.f,
|
||||
-100.f,
|
||||
-4.0f,
|
||||
-3.5f,
|
||||
-3.0f,
|
||||
HEX_FLT(-, 1, 800002, +, 1),
|
||||
-2.5f,
|
||||
HEX_FLT(-, 1, 7ffffe, +, 1),
|
||||
-2.0f,
|
||||
HEX_FLT(-, 1, 800002, +, 0),
|
||||
-1.5f,
|
||||
HEX_FLT(-, 1, 7ffffe, +, 0),
|
||||
HEX_FLT(-, 1, 000002, +, 0),
|
||||
-1.0f,
|
||||
HEX_FLT(-, 1, fffffe, -, 1),
|
||||
HEX_FLT(-, 1, 000002, -, 1),
|
||||
-0.5f,
|
||||
HEX_FLT(-, 1, fffffe, -, 2),
|
||||
HEX_FLT(-, 1, 000002, -, 2),
|
||||
-0.25f,
|
||||
HEX_FLT(-, 1, fffffe, -, 3),
|
||||
HEX_FLT(-, 1, 000002, -, 126),
|
||||
-std::numeric_limits<float>::min(),
|
||||
HEX_FLT(-, 0, fffffe, -, 126),
|
||||
HEX_FLT(-, 0, 000ffe, -, 126),
|
||||
HEX_FLT(-, 0, 0000fe, -, 126),
|
||||
HEX_FLT(-, 0, 00000e, -, 126),
|
||||
HEX_FLT(-, 0, 00000c, -, 126),
|
||||
HEX_FLT(-, 0, 00000a, -, 126),
|
||||
HEX_FLT(-, 0, 000008, -, 126),
|
||||
HEX_FLT(-, 0, 000006, -, 126),
|
||||
HEX_FLT(-, 0, 000004, -, 126),
|
||||
HEX_FLT(-, 0, 000002, -, 126),
|
||||
-0.0f,
|
||||
|
||||
std::numeric_limits<float>::quiet_NaN(),
|
||||
std::numeric_limits<float>::infinity(),
|
||||
std::numeric_limits<float>::max(),
|
||||
HEX_FLT(+, 1, 000002, +, 64),
|
||||
HEX_FLT(+, 1, 0, +, 64),
|
||||
HEX_FLT(+, 1, fffffe, +, 63),
|
||||
HEX_FLT(+, 1, 000002, +, 63),
|
||||
HEX_FLT(+, 1, 0, +, 63),
|
||||
HEX_FLT(+, 1, fffffe, +, 62),
|
||||
HEX_FLT(+, 1, 000002, +, 32),
|
||||
HEX_FLT(+, 1, 0, +, 32),
|
||||
HEX_FLT(+, 1, fffffe, +, 31),
|
||||
HEX_FLT(+, 1, 000002, +, 31),
|
||||
HEX_FLT(+, 1, 0, +, 31),
|
||||
HEX_FLT(+, 1, fffffe, +, 30),
|
||||
+1000.f,
|
||||
+100.f,
|
||||
+4.0f,
|
||||
+3.5f,
|
||||
+3.0f,
|
||||
HEX_FLT(+, 1, 800002, +, 1),
|
||||
2.5f,
|
||||
HEX_FLT(+, 1, 7ffffe, +, 1),
|
||||
+2.0f,
|
||||
HEX_FLT(+, 1, 800002, +, 0),
|
||||
1.5f,
|
||||
HEX_FLT(+, 1, 7ffffe, +, 0),
|
||||
HEX_FLT(+, 1, 000002, +, 0),
|
||||
+1.0f,
|
||||
HEX_FLT(+, 1, fffffe, -, 1),
|
||||
HEX_FLT(+, 1, 000002, -, 1),
|
||||
+0.5f,
|
||||
HEX_FLT(+, 1, fffffe, -, 2),
|
||||
HEX_FLT(+, 1, 000002, -, 2),
|
||||
+0.25f,
|
||||
HEX_FLT(+, 1, fffffe, -, 3),
|
||||
HEX_FLT(+, 1, 000002, -, 126),
|
||||
+std::numeric_limits<float>::min(),
|
||||
HEX_FLT(+, 0, fffffe, -, 126),
|
||||
HEX_FLT(+, 0, 000ffe, -, 126),
|
||||
HEX_FLT(+, 0, 0000fe, -, 126),
|
||||
HEX_FLT(+, 0, 00000e, -, 126),
|
||||
HEX_FLT(+, 0, 00000c, -, 126),
|
||||
HEX_FLT(+, 0, 00000a, -, 126),
|
||||
HEX_FLT(+, 0, 000008, -, 126),
|
||||
HEX_FLT(+, 0, 000006, -, 126),
|
||||
HEX_FLT(+, 0, 000004, -, 126),
|
||||
HEX_FLT(+, 0, 000002, -, 126),
|
||||
+0.0f,
|
||||
};
|
||||
|
||||
template <typename T> struct SpecialVals {
|
||||
const T* const data;
|
||||
const size_t size;
|
||||
};
|
||||
|
||||
inline constexpr auto kSpecialValRegistry =
|
||||
std::make_tuple(SpecialVals<float>{kSpecialValuesFloat.data(), kSpecialValuesFloat.size()},
|
||||
SpecialVals<double>{kSpecialValuesDouble.data(), kSpecialValuesDouble.size()});
|
||||
@@ -0,0 +1,246 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "math_common.hh"
|
||||
#include "math_special_values.hh"
|
||||
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
#define MATH_QUATERNARY_KERNEL_DEF(func_name) \
|
||||
template <typename T> \
|
||||
__global__ void func_name##_kernel(T* const ys, const size_t num_xs, T* const x1s, T* const x2s, \
|
||||
T* const x3s, T* const x4s) { \
|
||||
const auto tid = cg::this_grid().thread_rank(); \
|
||||
const auto stride = cg::this_grid().size(); \
|
||||
\
|
||||
for (auto i = tid; i < num_xs; i += stride) { \
|
||||
if constexpr (std::is_same_v<float, T>) { \
|
||||
ys[i] = func_name##f(x1s[i], x2s[i], x3s[i], x4s[i]); \
|
||||
} else if constexpr (std::is_same_v<double, T>) { \
|
||||
ys[i] = func_name(x1s[i], x2s[i], x3s[i], x4s[i]); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
inline constexpr std::array kSpecialValuesReducedDouble{
|
||||
-std::numeric_limits<double>::quiet_NaN(),
|
||||
-std::numeric_limits<double>::infinity(),
|
||||
-std::numeric_limits<double>::max(),
|
||||
HEX_DBL(-, 1, 0000000000001, +, 64),
|
||||
HEX_DBL(-, 1, fffffffffffff, +, 63),
|
||||
HEX_DBL(-, 1, fffffffffffff, +, 62),
|
||||
HEX_DBL(-, 1, 0, +, 32),
|
||||
HEX_DBL(-, 1, 0000000000001, +, 31),
|
||||
HEX_DBL(-, 1, fffffffffffff, +, 30),
|
||||
-1000.0,
|
||||
-3.5,
|
||||
HEX_DBL(-, 1, 8000000000001, +, 1),
|
||||
-2.5,
|
||||
HEX_DBL(-, 1, 8000000000001, +, 0),
|
||||
-1.5,
|
||||
-0.5,
|
||||
-0.25,
|
||||
HEX_DBL(-, 1, fffffffffffff, -, 3),
|
||||
-std::numeric_limits<double>::min(),
|
||||
HEX_DBL(-, 0, fffffffffffff, -, 1022),
|
||||
HEX_DBL(-, 0, 0000000000001, -, 1022),
|
||||
-0.0,
|
||||
|
||||
std::numeric_limits<double>::quiet_NaN(),
|
||||
std::numeric_limits<double>::infinity(),
|
||||
std::numeric_limits<double>::max(),
|
||||
HEX_DBL(+, 1, 0, +, 64),
|
||||
HEX_DBL(+, 1, 0000000000001, +, 63),
|
||||
HEX_DBL(+, 1, 000002, +, 32),
|
||||
HEX_DBL(+, 1, fffffffffffff, +, 31),
|
||||
HEX_DBL(+, 1, 0, +, 31),
|
||||
HEX_DBL(+, 1, fffffffffffff, +, 30),
|
||||
+100.0,
|
||||
+3.0,
|
||||
HEX_DBL(+, 1, 7ffffffffffff, +, 1),
|
||||
+2.0,
|
||||
HEX_DBL(+, 1, 7ffffffffffff, +, 0),
|
||||
+1.0,
|
||||
HEX_DBL(+, 1, fffffffffffff, -, 2),
|
||||
+std::numeric_limits<double>::min(),
|
||||
HEX_DBL(+, 0, 0000000000fff, -, 1022),
|
||||
HEX_DBL(+, 0, 0000000000007, -, 1022),
|
||||
+0.0,
|
||||
};
|
||||
|
||||
inline constexpr std::array kSpecialValuesReducedFloat{
|
||||
-std::numeric_limits<float>::quiet_NaN(),
|
||||
-std::numeric_limits<float>::infinity(),
|
||||
-std::numeric_limits<float>::max(),
|
||||
HEX_FLT(-, 1, 000002, +, 64),
|
||||
HEX_FLT(-, 1, fffffe, +, 63),
|
||||
HEX_FLT(-, 1, fffffe, +, 62),
|
||||
HEX_FLT(-, 1, 0, +, 32),
|
||||
HEX_FLT(-, 1, fffffe, +, 31),
|
||||
HEX_FLT(-, 1, fffffe, +, 30),
|
||||
-1000.f,
|
||||
-3.5f,
|
||||
HEX_FLT(-, 1, 800002, +, 1),
|
||||
-2.5f,
|
||||
HEX_FLT(-, 1, 800002, +, 0),
|
||||
-1.5f,
|
||||
-0.5f,
|
||||
-0.25f,
|
||||
HEX_FLT(-, 1, fffffe, -, 3),
|
||||
-std::numeric_limits<float>::min(),
|
||||
HEX_FLT(-, 0, fffffe, -, 126),
|
||||
HEX_FLT(-, 0, 000002, -, 126),
|
||||
-0.0f,
|
||||
|
||||
std::numeric_limits<float>::quiet_NaN(),
|
||||
std::numeric_limits<float>::infinity(),
|
||||
std::numeric_limits<float>::max(),
|
||||
HEX_FLT(+, 1, 0, +, 64),
|
||||
HEX_FLT(+, 1, 000002, +, 63),
|
||||
HEX_FLT(+, 1, 000002, +, 32),
|
||||
HEX_FLT(+, 1, 000002, +, 31),
|
||||
HEX_FLT(+, 1, fffffe, +, 30),
|
||||
+100.f,
|
||||
+4.0f,
|
||||
HEX_FLT(+, 1, 7ffffe, +, 1),
|
||||
+2.0f,
|
||||
HEX_FLT(+, 1, 7ffffe, +, 0),
|
||||
+1.0f,
|
||||
HEX_FLT(+, 1, fffffe, -, 2),
|
||||
+std::numeric_limits<float>::min(),
|
||||
HEX_FLT(+, 0, 000ffe, -, 126),
|
||||
HEX_FLT(+, 0, 000006, -, 126),
|
||||
+0.0f,
|
||||
};
|
||||
|
||||
inline constexpr auto kSpecialValReducedRegistry = std::make_tuple(
|
||||
SpecialVals<float>{kSpecialValuesReducedFloat.data(), kSpecialValuesReducedFloat.size()},
|
||||
SpecialVals<double>{kSpecialValuesReducedDouble.data(), kSpecialValuesReducedDouble.size()});
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void QuaternaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
|
||||
ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder,
|
||||
const TArg a = std::numeric_limits<TArg>::lowest(),
|
||||
const TArg b = std::numeric_limits<TArg>::max()) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const uint64_t num_iterations = GetTestIterationCount();
|
||||
const auto max_batch_size =
|
||||
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 4 + sizeof(T)), num_iterations);
|
||||
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x4s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
|
||||
MathTest math_test(kernel, max_batch_size);
|
||||
|
||||
auto batch_size = max_batch_size;
|
||||
const auto num_threads = thread_pool.thread_count();
|
||||
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
|
||||
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
|
||||
|
||||
const auto min_sub_batch_size = batch_size / num_threads;
|
||||
const auto tail = batch_size % num_threads;
|
||||
|
||||
auto base_idx = 0u;
|
||||
for (auto i = 0u; i < num_threads; ++i) {
|
||||
const auto sub_batch_size = min_sub_batch_size + (i < tail);
|
||||
thread_pool.Post([=, &x1s, &x2s, &x3s, &x4s] {
|
||||
const auto generator = [=] {
|
||||
static thread_local std::mt19937 rng(std::random_device{}());
|
||||
std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
|
||||
return static_cast<TArg>(unif_dist(rng));
|
||||
};
|
||||
std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
|
||||
std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
|
||||
std::generate(x3s.ptr() + base_idx, x3s.ptr() + base_idx + sub_batch_size, generator);
|
||||
std::generate(x4s.ptr() + base_idx, x4s.ptr() + base_idx + sub_batch_size, generator);
|
||||
});
|
||||
base_idx += sub_batch_size;
|
||||
}
|
||||
|
||||
thread_pool.Wait();
|
||||
|
||||
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
|
||||
x2s.ptr(), x3s.ptr(), x4s.ptr());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void QuaternaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
|
||||
ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const auto values = std::get<SpecialVals<TArg>>(kSpecialValReducedRegistry);
|
||||
|
||||
const auto size = values.size * values.size * values.size * values.size;
|
||||
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x4s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
|
||||
for (auto i = 0u; i < values.size; ++i) {
|
||||
for (auto j = 0u; j < values.size; ++j) {
|
||||
for (auto k = 0u; k < values.size; ++k) {
|
||||
for (auto l = 0u; l < values.size; ++l) {
|
||||
x1s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[i];
|
||||
x2s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[j];
|
||||
x3s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[k];
|
||||
x4s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MathTest math_test(kernel, size);
|
||||
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
|
||||
x2s.ptr(), x3s.ptr(), x4s.ptr());
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void QuaternaryFloatingPointTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
|
||||
ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
SECTION("Special values") {
|
||||
QuaternaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
|
||||
}
|
||||
|
||||
SECTION("Brute force") {
|
||||
QuaternaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define MATH_QUATERNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
|
||||
MATH_QUATERNARY_KERNEL_DEF(kern_name) \
|
||||
\
|
||||
TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) { \
|
||||
using RT = RefType_t<TestType>; \
|
||||
RT (*ref)(RT, RT, RT, RT) = ref_func; \
|
||||
const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp; \
|
||||
\
|
||||
QuaternaryFloatingPointTest(kern_name##_kernel<TestType>, ref, \
|
||||
ULPValidatorBuilderFactory<TestType>(ulp)); \
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "math_common.hh"
|
||||
#include "math_special_values.hh"
|
||||
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
#define MATH_TERNARY_KERNEL_DEF(func_name) \
|
||||
template <typename T> \
|
||||
__global__ void func_name##_kernel(T* const ys, const size_t num_xs, T* const x1s, T* const x2s, \
|
||||
T* const x3s) { \
|
||||
const auto tid = cg::this_grid().thread_rank(); \
|
||||
const auto stride = cg::this_grid().size(); \
|
||||
\
|
||||
for (auto i = tid; i < num_xs; i += stride) { \
|
||||
if constexpr (std::is_same_v<float, T>) { \
|
||||
ys[i] = func_name##f(x1s[i], x2s[i], x3s[i]); \
|
||||
} else if constexpr (std::is_same_v<double, T>) { \
|
||||
ys[i] = func_name(x1s[i], x2s[i], x3s[i]); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void TernaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg, TArg> kernel,
|
||||
ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder,
|
||||
const TArg a = std::numeric_limits<TArg>::lowest(),
|
||||
const TArg b = std::numeric_limits<TArg>::max()) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const uint64_t num_iterations = GetTestIterationCount();
|
||||
const auto max_batch_size =
|
||||
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 3 + sizeof(T)), num_iterations);
|
||||
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
|
||||
|
||||
MathTest math_test(kernel, max_batch_size);
|
||||
|
||||
auto batch_size = max_batch_size;
|
||||
const auto num_threads = thread_pool.thread_count();
|
||||
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
|
||||
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
|
||||
|
||||
const auto min_sub_batch_size = batch_size / num_threads;
|
||||
const auto tail = batch_size % num_threads;
|
||||
|
||||
auto base_idx = 0u;
|
||||
for (auto i = 0u; i < num_threads; ++i) {
|
||||
const auto sub_batch_size = min_sub_batch_size + (i < tail);
|
||||
thread_pool.Post([=, &x1s, &x2s, &x3s] {
|
||||
const auto generator = [=] {
|
||||
static thread_local std::mt19937 rng(std::random_device{}());
|
||||
std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
|
||||
return static_cast<TArg>(unif_dist(rng));
|
||||
};
|
||||
std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
|
||||
std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
|
||||
std::generate(x3s.ptr() + base_idx, x3s.ptr() + base_idx + sub_batch_size, generator);
|
||||
});
|
||||
base_idx += sub_batch_size;
|
||||
}
|
||||
|
||||
thread_pool.Wait();
|
||||
|
||||
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
|
||||
x2s.ptr(), x3s.ptr());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void TernaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg, TArg> kernel,
|
||||
ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const auto values = std::get<SpecialVals<TArg>>(kSpecialValRegistry);
|
||||
|
||||
const auto size = values.size * values.size * values.size;
|
||||
LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
|
||||
|
||||
for (auto i = 0u; i < values.size; ++i) {
|
||||
for (auto j = 0u; j < values.size; ++j) {
|
||||
for (auto k = 0u; k < values.size; ++k) {
|
||||
x1s.ptr()[(i * values.size + j) * values.size + k] = values.data[i];
|
||||
x2s.ptr()[(i * values.size + j) * values.size + k] = values.data[j];
|
||||
x3s.ptr()[(i * values.size + j) * values.size + k] = values.data[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MathTest math_test(kernel, size);
|
||||
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
|
||||
x2s.ptr(), x3s.ptr());
|
||||
}
|
||||
|
||||
template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void TernaryFloatingPointTest(kernel_sig<T, TArg, TArg, TArg> kernel, ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
SECTION("Special values") {
|
||||
TernaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
|
||||
}
|
||||
|
||||
SECTION("Brute force") { TernaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder); }
|
||||
}
|
||||
|
||||
|
||||
#define MATH_TERNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
|
||||
MATH_TERNARY_KERNEL_DEF(kern_name) \
|
||||
\
|
||||
TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) { \
|
||||
using RT = RefType_t<TestType>; \
|
||||
RT (*ref)(RT, RT, RT) = ref_func; \
|
||||
const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp; \
|
||||
\
|
||||
TernaryFloatingPointTest(kern_name##_kernel<TestType>, ref, \
|
||||
ULPValidatorBuilderFactory<TestType>(ulp)); \
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <thread>
|
||||
|
||||
#include <boost/asio/post.hpp>
|
||||
#include <boost/asio/thread_pool.hpp>
|
||||
|
||||
// This is a simple wrapper around boost::asio::thread_pool that keeps track of the number of
|
||||
// currently active tasks using an atomic counter.
|
||||
class ThreadPool {
|
||||
public:
|
||||
ThreadPool(size_t thread_count = std::thread::hardware_concurrency())
|
||||
: thread_count_(thread_count) {}
|
||||
|
||||
~ThreadPool() { thread_pool_.join(); }
|
||||
|
||||
// Submits a task to the thread pool and increments the number of active tasks. The task is
|
||||
// wrapped in a lambda that decrements the number of active tasks upon completion.
|
||||
template <typename T> void Post(T&& task) {
|
||||
++active_tasks_;
|
||||
auto&& task_wrapper = [task, this] {
|
||||
task();
|
||||
--active_tasks_;
|
||||
};
|
||||
boost::asio::post(thread_pool_, task_wrapper);
|
||||
}
|
||||
|
||||
// Busy waits for the number of active tasks to reach zero.
|
||||
void Wait() const {
|
||||
while (active_tasks_.load(std::memory_order_relaxed))
|
||||
;
|
||||
}
|
||||
|
||||
size_t thread_count() const { return thread_count_; }
|
||||
|
||||
private:
|
||||
const size_t thread_count_;
|
||||
boost::asio::thread_pool thread_pool_{thread_count_};
|
||||
std::atomic<size_t> active_tasks_;
|
||||
};
|
||||
|
||||
inline ThreadPool thread_pool{};
|
||||
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
#define TRIG_DP_UNARY_NEGATIVE_KERNELS(func_name) \
|
||||
__global__ void func_name##_kernel_v1(double* x) { double result = func_name(x); } \
|
||||
__global__ void func_name##_kernel_v2(Dummy x) { double result = func_name(x); }
|
||||
|
||||
/*Expecting 2 errors per macro invocation - 26 total*/
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(sin)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(cos)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(tan)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(asin)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(acos)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(atan)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(sinh)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(cosh)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(tanh)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(asinh)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(atanh)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(sinpi)
|
||||
TRIG_DP_UNARY_NEGATIVE_KERNELS(cospi)
|
||||
|
||||
/*Expecting 4 errors*/
|
||||
__global__ void atan2_kernel_v1(double* x, double y) { double result = atan2(x, y); }
|
||||
__global__ void atan2_kernel_v2(double x, double* y) { double result = atan2(x, y); }
|
||||
__global__ void atan2_kernel_v3(Dummy x, double y) { double result = atan2(x, y); }
|
||||
__global__ void atan2_kernel_v4(double x, Dummy y) { double result = atan2(x, y); }
|
||||
|
||||
/*Expecting 18 errors*/
|
||||
__global__ void sincos_kernel_v1(double* x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v2(Dummy x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v3(double x, char* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v4(double x, short* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v5(double x, int* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v6(double x, long* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v7(double x, long long* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v8(double x, float* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v9(double x, Dummy* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v10(double x, const double* sptr, double* cptr) {
|
||||
sincos(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincos_kernel_v11(double x, double* sptr, char* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v12(double x, double* sptr, short* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v13(double x, double* sptr, int* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v14(double x, double* sptr, long* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v15(double x, double* sptr, long long* cptr) {
|
||||
sincos(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincos_kernel_v16(double x, double* sptr, float* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v17(double x, double* sptr, Dummy* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v18(double x, double* sptr, const double* cptr) {
|
||||
sincos(x, sptr, cptr);
|
||||
}
|
||||
|
||||
/*Expecting 18 errors*/
|
||||
__global__ void sincospi_kernel_v1(float* x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v3(float x, char* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v4(float x, short* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v5(float x, int* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v6(float x, long* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v7(float x, long long* sptr, float* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospi_kernel_v8(float x, double* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v10(float x, const float* sptr, float* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospi_kernel_v11(float x, float* sptr, char* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v12(float x, float* sptr, short* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v13(float x, float* sptr, int* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v14(float x, float* sptr, long* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v15(float x, float* sptr, long long* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospi_kernel_v16(float x, float* sptr, double* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v17(float x, float* sptr, Dummy* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v18(float x, float* sptr, const float* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "trig_negative_kernels_rtc.hh"
|
||||
|
||||
#include "unary_common.hh"
|
||||
#include "binary_common.hh"
|
||||
|
||||
#include <boost/math/special_functions.hpp>
|
||||
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(sin, std::sin, 2, 2);
|
||||
TEST_CASE("Unit_Device_sin_sinf_Negative_RTC") { NegativeTestRTCWrapper<4>(kSin); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(cos, std::cos, 2, 2)
|
||||
TEST_CASE("Unit_Device_cos_cosf_Negative_RTC") { NegativeTestRTCWrapper<4>(kCos); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(tan, std::tan, 4, 2)
|
||||
TEST_CASE("Unit_Device_tan_tanf_Negative_RTC") { NegativeTestRTCWrapper<4>(kTan); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(asin, std::asin, 2, 2)
|
||||
TEST_CASE("Unit_Device_asin_asinf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAsin); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(acos, std::acos, 2, 2)
|
||||
TEST_CASE("Unit_Device_acos_acosf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAcos); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(atan, std::atan, 2, 2)
|
||||
TEST_CASE("Unit_Device_atan_atanf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAtan); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(sinh, std::sinh, 3, 2)
|
||||
TEST_CASE("Unit_Device_sinh_sinhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kSinh); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(cosh, std::cosh, 2, 1)
|
||||
TEST_CASE("Unit_Device_cosh_coshf_Negative_RTC") { NegativeTestRTCWrapper<4>(kCosh); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(tanh, std::tanh, 2, 1)
|
||||
TEST_CASE("Unit_Device_tanh_tanhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kTanh); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(asinh, std::asinh, 3, 2)
|
||||
TEST_CASE("Unit_Device_asinh_asinhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAsinh); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(acosh, std::acosh, 4, 2)
|
||||
TEST_CASE("Unit_Device_acosh_acoshf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAcosh); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(atanh, std::atanh, 3, 2)
|
||||
TEST_CASE("Unit_Device_atanh_atanhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAtanh); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(sinpi, boost::math::sin_pi, 2, 2);
|
||||
TEST_CASE("Unit_Device_sinpi_sinpif_Negative_RTC") { NegativeTestRTCWrapper<4>(kSinpi); }
|
||||
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(cospi, boost::math::cos_pi, 2, 2);
|
||||
TEST_CASE("Unit_Device_cospi_cospif_Negative_RTC") { NegativeTestRTCWrapper<4>(kCospi); }
|
||||
|
||||
MATH_BINARY_WITHIN_ULP_TEST_DEF(atan2, std::atan2, 3, 2);
|
||||
TEST_CASE("Unit_Device_atan2_atan2f_Negative_RTC") { NegativeTestRTCWrapper<8>(kAtan2); }
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void sincos_kernel(std::pair<T, T>* const ys, const size_t num_xs, T* const xs) {
|
||||
const auto tid = cg::this_grid().thread_rank();
|
||||
const auto stride = cg::this_grid().size();
|
||||
|
||||
for (auto i = tid; i < num_xs; i += stride) {
|
||||
if constexpr (std::is_same_v<float, T>) {
|
||||
sincosf(xs[i], &ys[i].first, &ys[i].second);
|
||||
} else if constexpr (std::is_same_v<double, T>) {
|
||||
sincos(xs[i], &ys[i].first, &ys[i].second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> std::pair<T, T> sincos(T x) { return {std::sin(x), std::cos(x)}; }
|
||||
|
||||
TEST_CASE("Unit_Device_sincos_Accuracy_Positive - float") {
|
||||
UnarySinglePrecisionTest(
|
||||
sincos_kernel<float>, sincos<double>,
|
||||
PairValidatorBuilderFactory<float>(ULPValidatorBuilderFactory<float>(2)));
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_Device_sincos_Accuracy_Positive - double") {
|
||||
const auto validator_builder =
|
||||
PairValidatorBuilderFactory<double>(ULPValidatorBuilderFactory<double>(2));
|
||||
UnaryDoublePrecisionTest(sincos_kernel<double>, sincos<long double>, validator_builder);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_Device_sincos_sincosf_Negative_RTC") { NegativeTestRTCWrapper<36>(kSincos); }
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void sincospi_kernel(std::pair<T, T>* const ys, const size_t num_xs, T* const xs) {
|
||||
const auto tid = cg::this_grid().thread_rank();
|
||||
const auto stride = cg::this_grid().size();
|
||||
|
||||
for (auto i = tid; i < num_xs; i += stride) {
|
||||
if constexpr (std::is_same_v<float, T>) {
|
||||
sincospif(xs[i], &ys[i].first, &ys[i].second);
|
||||
} else if constexpr (std::is_same_v<double, T>) {
|
||||
sincospi(xs[i], &ys[i].first, &ys[i].second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> std::pair<T, T> sincospi(T x) {
|
||||
return {boost::math::sin_pi(x), boost::math::cos_pi(x)};
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_Device_sincospi_Accuracy_Positive - float") {
|
||||
UnarySinglePrecisionTest(
|
||||
sincospi_kernel<float>, sincospi<double>,
|
||||
PairValidatorBuilderFactory<float>(ULPValidatorBuilderFactory<float>(2)));
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_Device_sincospi_Accuracy_Positive - double") {
|
||||
const auto validator_builder =
|
||||
PairValidatorBuilderFactory<double>(ULPValidatorBuilderFactory<double>(2));
|
||||
UnaryDoublePrecisionTest(sincospi_kernel<double>, sincospi<long double>, validator_builder);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_Device_sincospi_sincospif_Negative_RTC") { NegativeTestRTCWrapper<36>(kSincospi); }
|
||||
@@ -0,0 +1,320 @@
|
||||
// #define TRIG_UNARY_NEGATIVE_KERNELS(func_name)
|
||||
// class Dummy {
|
||||
// public:
|
||||
// __device__ Dummy() {}
|
||||
// __device__ ~Dummy() {}
|
||||
// };
|
||||
// __global__ void func_name##f_kernel_v1(float* x) { float result = func_name##f(x); }
|
||||
// __global__ void func_name##f_kernel_v2(Dummy x) { float result = func_name##f(x); }
|
||||
// __global__ void func_name##_kernel_v1(double* x) { double result = func_name(x); }
|
||||
// __global__ void func_name##_kernel_v2(Dummy x) { double result = func_name(x); }
|
||||
|
||||
static constexpr auto kSin{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void sinf_kernel_v1(float* x) { float result = sinf(x); }
|
||||
__global__ void sinf_kernel_v2(Dummy x) { float result = sinf(x); }
|
||||
__global__ void sin_kernel_v1(double* x) { double result = sin(x); }
|
||||
__global__ void sin_kernel_v2(Dummy x) { double result = sin(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kCos{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void cosf_kernel_v1(float* x) { float result = cosf(x); }
|
||||
__global__ void cosf_kernel_v2(Dummy x) { float result = cosf(x); }
|
||||
__global__ void cos_kernel_v1(double* x) { double result = cos(x); }
|
||||
__global__ void cos_kernel_v2(Dummy x) { double result = cos(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kTan{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void tanf_kernel_v1(float* x) { float result = tanf(x); }
|
||||
__global__ void tanf_kernel_v2(Dummy x) { float result = tanf(x); }
|
||||
__global__ void tan_kernel_v1(double* x) { double result = tan(x); }
|
||||
__global__ void tan_kernel_v2(Dummy x) { double result = tan(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kAsin{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void asinf_kernel_v1(float* x) { float result = asinf(x); }
|
||||
__global__ void asinf_kernel_v2(Dummy x) { float result = asinf(x); }
|
||||
__global__ void asin_kernel_v1(double* x) { double result = asin(x); }
|
||||
__global__ void asin_kernel_v2(Dummy x) { double result = asin(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kAcos{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void acosf_kernel_v1(float* x) { float result = acosf(x); }
|
||||
__global__ void acosf_kernel_v2(Dummy x) { float result = acosf(x); }
|
||||
__global__ void acos_kernel_v1(double* x) { double result = acos(x); }
|
||||
__global__ void acos_kernel_v2(Dummy x) { double result = acos(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtan{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void atanf_kernel_v1(float* x) { float result = atanf(x); }
|
||||
__global__ void atanf_kernel_v2(Dummy x) { float result = atanf(x); }
|
||||
__global__ void atan_kernel_v1(double* x) { double result = atan(x); }
|
||||
__global__ void atan_kernel_v2(Dummy x) { double result = atan(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kSinh{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void sinhf_kernel_v1(float* x) { float result = sinhf(x); }
|
||||
__global__ void sinhf_kernel_v2(Dummy x) { float result = sinhf(x); }
|
||||
__global__ void sinh_kernel_v1(double* x) { double result = sinh(x); }
|
||||
__global__ void sinh_kernel_v2(Dummy x) { double result = sinh(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kCosh{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void coshf_kernel_v1(float* x) { float result = coshf(x); }
|
||||
__global__ void coshf_kernel_v2(Dummy x) { float result = coshf(x); }
|
||||
__global__ void cosh_kernel_v1(double* x) { double result = cosh(x); }
|
||||
__global__ void cosh_kernel_v2(Dummy x) { double result = cosh(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kTanh{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void tanhf_kernel_v1(float* x) { float result = tanhf(x); }
|
||||
__global__ void tanhf_kernel_v2(Dummy x) { float result = tanhf(x); }
|
||||
__global__ void tanh_kernel_v1(double* x) { double result = tanh(x); }
|
||||
__global__ void tanh_kernel_v2(Dummy x) { double result = tanh(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kAsinh{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void asinhf_kernel_v1(float* x) { float result = asinhf(x); }
|
||||
__global__ void asinhf_kernel_v2(Dummy x) { float result = asinhf(x); }
|
||||
__global__ void asinh_kernel_v1(double* x) { double result = asinh(x); }
|
||||
__global__ void asinh_kernel_v2(Dummy x) { double result = asinh(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kAcosh{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void acoshf_kernel_v1(float* x) { float result = acoshf(x); }
|
||||
__global__ void acoshf_kernel_v2(Dummy x) { float result = acoshf(x); }
|
||||
__global__ void acosh_kernel_v1(double* x) { double result = acosh(x); }
|
||||
__global__ void acosh_kernel_v2(Dummy x) { double result = acosh(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtanh{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void atanhf_kernel_v1(float* x) { float result = atanhf(x); }
|
||||
__global__ void atanhf_kernel_v2(Dummy x) { float result = atanhf(x); }
|
||||
__global__ void atanh_kernel_v1(double* x) { double result = atanh(x); }
|
||||
__global__ void atanh_kernel_v2(Dummy x) { double result = atanh(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kSinpi{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void sinpif_kernel_v1(float* x) { float result = sinpif(x); }
|
||||
__global__ void sinpif_kernel_v2(Dummy x) { float result = sinpif(x); }
|
||||
__global__ void sinpi_kernel_v1(double* x) { double result = sinpi(x); }
|
||||
__global__ void sinpi_kernel_v2(Dummy x) { double result = sinpi(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kCospi{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void cospif_kernel_v1(float* x) { float result = cospif(x); }
|
||||
__global__ void cospif_kernel_v2(Dummy x) { float result = cospif(x); }
|
||||
__global__ void cospi_kernel_v1(double* x) { double result = cospi(x); }
|
||||
__global__ void cospi_kernel_v2(Dummy x) { double result = cospi(x); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kAtan2{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void atan2f_kernel_v1(float* x, float y) { float result = atan2f(x, y); }
|
||||
__global__ void atan2f_kernel_v2(float x, float* y) { float result = atan2f(x, y); }
|
||||
__global__ void atan2f_kernel_v3(Dummy x, float y) { float result = atan2f(x, y); }
|
||||
__global__ void atan2f_kernel_v4(float x, Dummy y) { float result = atan2f(x, y); }
|
||||
__global__ void atan2_kernel_v1(double* x, double y) { double result = atan2(x, y); }
|
||||
__global__ void atan2_kernel_v2(double x, double* y) { double result = atan2(x, y); }
|
||||
__global__ void atan2_kernel_v3(Dummy x, double y) { double result = atan2(x, y); }
|
||||
__global__ void atan2_kernel_v4(double x, Dummy y) { double result = atan2(x, y); }
|
||||
)"};
|
||||
|
||||
static constexpr auto kSincos{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void sincosf_kernel_v1(float* x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v2(Dummy x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v3(float x, char* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v4(float x, short* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v5(float x, int* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v6(float x, long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v7(float x, long long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v8(float x, double* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v9(float x, Dummy* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v10(float x, const float* sptr, float* cptr) {
|
||||
sincosf(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincosf_kernel_v11(float x, float* sptr, char* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v12(float x, float* sptr, short* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v13(float x, float* sptr, int* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v14(float x, float* sptr, long* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v15(float x, float* sptr, long long* cptr) {
|
||||
sincosf(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincosf_kernel_v16(float x, float* sptr, double* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v17(float x, float* sptr, Dummy* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v18(float x, float* sptr, const float* cptr) {
|
||||
sincosf(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincos_kernel_v1(double* x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v2(Dummy x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v3(double x, char* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v4(double x, short* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v5(double x, int* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v6(double x, long* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v7(double x, long long* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v8(double x, float* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v9(double x, Dummy* sptr, double* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v10(double x, const double* sptr, double* cptr) {
|
||||
sincos(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincos_kernel_v11(double x, double* sptr, char* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v12(double x, double* sptr, short* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v13(double x, double* sptr, int* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v14(double x, double* sptr, long* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v15(double x, double* sptr, long long* cptr) {
|
||||
sincos(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincos_kernel_v16(double x, double* sptr, float* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v17(double x, double* sptr, Dummy* cptr) { sincos(x, sptr, cptr); }
|
||||
__global__ void sincos_kernel_v18(double x, double* sptr, const double* cptr) {
|
||||
sincos(x, sptr, cptr);
|
||||
}
|
||||
)"};
|
||||
|
||||
static constexpr auto kSincospi{R"(
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
__global__ void sincospif_kernel_v1(float* x, float* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v3(float x, char* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v4(float x, short* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v5(float x, int* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v6(float x, long* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v7(float x, long long* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v8(float x, double* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v10(float x, const float* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v11(float x, float* sptr, char* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v12(float x, float* sptr, short* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v13(float x, float* sptr, int* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v14(float x, float* sptr, long* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v15(float x, float* sptr, long long* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v16(float x, float* sptr, double* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v17(float x, float* sptr, Dummy* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v18(float x, float* sptr, const float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospi_kernel_v1(float* x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v3(float x, char* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v4(float x, short* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v5(float x, int* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v6(float x, long* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v7(float x, long long* sptr, float* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospi_kernel_v8(float x, double* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v10(float x, const float* sptr, float* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospi_kernel_v11(float x, float* sptr, char* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v12(float x, float* sptr, short* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v13(float x, float* sptr, int* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v14(float x, float* sptr, long* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v15(float x, float* sptr, long long* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospi_kernel_v16(float x, float* sptr, double* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v17(float x, float* sptr, Dummy* cptr) { sincospi(x, sptr, cptr); }
|
||||
__global__ void sincospi_kernel_v18(float x, float* sptr, const float* cptr) {
|
||||
sincospi(x, sptr, cptr);
|
||||
}
|
||||
)"};
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
class Dummy {
|
||||
public:
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
#define TRIG_SP_UNARY_NEGATIVE_KERNELS(func_name) \
|
||||
__global__ void func_name##f_kernel_v1(float* x) { float result = func_name##f(x); } \
|
||||
__global__ void func_name##f_kernel_v2(Dummy x) { float result = func_name##f(x); }
|
||||
|
||||
/*Expecting 2 errors per macro invocation - 26 total*/
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(sin)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(cos)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(tan)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(asin)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(acos)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(atan)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(sinh)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(cosh)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(tanh)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(asinh)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(atanh)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(sinpi)
|
||||
TRIG_SP_UNARY_NEGATIVE_KERNELS(cospi)
|
||||
|
||||
/*Expecting 4 errors*/
|
||||
__global__ void atan2f_kernel_v1(float* x, float y) { float result = atan2f(x, y); }
|
||||
__global__ void atan2f_kernel_v2(float x, float* y) { float result = atan2f(x, y); }
|
||||
__global__ void atan2f_kernel_v3(Dummy x, float y) { float result = atan2f(x, y); }
|
||||
__global__ void atan2f_kernel_v4(float x, Dummy y) { float result = atan2f(x, y); }
|
||||
|
||||
/*Expecting 18 errors*/
|
||||
__global__ void sincosf_kernel_v1(float* x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v2(Dummy x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v3(float x, char* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v4(float x, short* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v5(float x, int* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v6(float x, long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v7(float x, long long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v8(float x, double* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v9(float x, Dummy* sptr, float* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v10(float x, const float* sptr, float* cptr) {
|
||||
sincosf(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincosf_kernel_v11(float x, float* sptr, char* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v12(float x, float* sptr, short* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v13(float x, float* sptr, int* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v14(float x, float* sptr, long* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v15(float x, float* sptr, long long* cptr) {
|
||||
sincosf(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincosf_kernel_v16(float x, float* sptr, double* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v17(float x, float* sptr, Dummy* cptr) { sincosf(x, sptr, cptr); }
|
||||
__global__ void sincosf_kernel_v18(float x, float* sptr, const float* cptr) {
|
||||
sincosf(x, sptr, cptr);
|
||||
}
|
||||
|
||||
/*Expecting 18 errors*/
|
||||
__global__ void sincospif_kernel_v1(float* x, float* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v3(float x, char* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v4(float x, short* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v5(float x, int* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v6(float x, long* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v7(float x, long long* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v8(float x, double* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v10(float x, const float* sptr, float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v11(float x, float* sptr, char* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v12(float x, float* sptr, short* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v13(float x, float* sptr, int* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v14(float x, float* sptr, long* cptr) { sincospif(x, sptr, cptr); }
|
||||
__global__ void sincospif_kernel_v15(float x, float* sptr, long long* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v16(float x, float* sptr, double* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v17(float x, float* sptr, Dummy* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
__global__ void sincospif_kernel_v18(float x, float* sptr, const float* cptr) {
|
||||
sincospif(x, sptr, cptr);
|
||||
}
|
||||
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "math_common.hh"
|
||||
#include "math_special_values.hh"
|
||||
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
#define MATH_UNARY_KERNEL_DEF(func_name) \
|
||||
template <typename T, typename RT = T> \
|
||||
__global__ void func_name##_kernel(RT* const ys, const size_t num_xs, T* const xs) { \
|
||||
const auto tid = cg::this_grid().thread_rank(); \
|
||||
const auto stride = cg::this_grid().size(); \
|
||||
\
|
||||
for (auto i = tid; i < num_xs; i += stride) { \
|
||||
if constexpr (std::is_same_v<float, T>) { \
|
||||
ys[i] = func_name##f(xs[i]); \
|
||||
} else if constexpr (std::is_same_v<double, T>) { \
|
||||
ys[i] = func_name(xs[i]); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void UnarySinglePrecisionBruteForceTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
uint64_t stop = std::numeric_limits<uint32_t>::max() + 1ul;
|
||||
const auto max_batch_size =
|
||||
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T)), stop);
|
||||
LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
|
||||
|
||||
MathTest math_test(kernel, max_batch_size);
|
||||
|
||||
auto batch_size = max_batch_size;
|
||||
const auto num_threads = thread_pool.thread_count();
|
||||
|
||||
for (uint64_t v = 0u; v < stop;) {
|
||||
batch_size = std::min<uint64_t>(max_batch_size, stop - v);
|
||||
|
||||
const auto min_sub_batch_size = batch_size / num_threads;
|
||||
const auto tail = batch_size % num_threads;
|
||||
|
||||
auto base_idx = 0u;
|
||||
for (auto i = 0u; i < num_threads; ++i) {
|
||||
const auto sub_batch_size = min_sub_batch_size + (i < tail);
|
||||
|
||||
thread_pool.Post([=, &values] {
|
||||
auto t = v;
|
||||
uint32_t val;
|
||||
for (auto j = 0u; j < sub_batch_size; ++j) {
|
||||
val = static_cast<uint32_t>(t++);
|
||||
values.ptr()[base_idx + j] = *reinterpret_cast<float*>(&val);
|
||||
}
|
||||
});
|
||||
|
||||
v += sub_batch_size;
|
||||
base_idx += sub_batch_size;
|
||||
}
|
||||
|
||||
thread_pool.Wait();
|
||||
|
||||
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, values.ptr());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void UnarySinglePrecisionRangeTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder, const float a,
|
||||
const float b) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
uint64_t stop = std::numeric_limits<uint32_t>::max() + 1ul;
|
||||
const auto max_batch_size = GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T));
|
||||
LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
|
||||
|
||||
MathTest math_test(kernel, max_batch_size);
|
||||
|
||||
uint32_t val = 0u;
|
||||
const auto num_threads = thread_pool.thread_count();
|
||||
|
||||
size_t inserted = 0u;
|
||||
for (float v = a; v != b; v = std::nextafter(v, b)) {
|
||||
values.ptr()[inserted++] = v;
|
||||
if (inserted < max_batch_size) continue;
|
||||
|
||||
math_test.Run(validator_builder, grid_size, block_size, ref_func, inserted, values.ptr());
|
||||
inserted = 0u;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void UnaryDoublePrecisionBruteForceTest(kernel_sig<T, double> kernel, ref_sig<RT, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder,
|
||||
const double a = std::numeric_limits<double>::lowest(),
|
||||
const double b = std::numeric_limits<double>::max()) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const uint64_t num_iterations = GetTestIterationCount();
|
||||
const auto max_batch_size =
|
||||
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(double) + sizeof(T)), num_iterations);
|
||||
LinearAllocGuard<double> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(double)};
|
||||
|
||||
MathTest math_test(kernel, max_batch_size);
|
||||
|
||||
auto batch_size = max_batch_size;
|
||||
const auto num_threads = thread_pool.thread_count();
|
||||
for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
|
||||
batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
|
||||
|
||||
const auto min_sub_batch_size = batch_size / num_threads;
|
||||
const auto tail = batch_size % num_threads;
|
||||
|
||||
auto base_idx = 0u;
|
||||
for (auto i = 0u; i < num_threads; ++i) {
|
||||
const auto sub_batch_size = min_sub_batch_size + (i < tail);
|
||||
thread_pool.Post([=, &values] {
|
||||
const auto generator = [=] {
|
||||
static thread_local std::mt19937 rng(std::random_device{}());
|
||||
std::uniform_real_distribution<long double> unif_dist(a, b);
|
||||
return static_cast<double>(unif_dist(rng));
|
||||
};
|
||||
std::generate(values.ptr() + base_idx, values.ptr() + base_idx + sub_batch_size, generator);
|
||||
});
|
||||
base_idx += sub_batch_size;
|
||||
}
|
||||
|
||||
thread_pool.Wait();
|
||||
|
||||
math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, values.ptr());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void UnaryDoublePrecisionSpecialValuesTest(kernel_sig<T, double> kernel,
|
||||
ref_sig<RT, RTArg> ref_func,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
|
||||
const auto values = std::get<SpecialVals<double>>(kSpecialValRegistry);
|
||||
|
||||
MathTest math_test(kernel, values.size);
|
||||
math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, values.size,
|
||||
values.data);
|
||||
}
|
||||
|
||||
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void UnarySinglePrecisionTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
SECTION("Brute force") { UnarySinglePrecisionBruteForceTest(kernel, ref, validator_builder); }
|
||||
}
|
||||
|
||||
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
|
||||
void UnaryDoublePrecisionTest(kernel_sig<T, double> kernel, ref_sig<RT, RTArg> ref,
|
||||
const ValidatorBuilder& validator_builder) {
|
||||
SECTION("Special values") {
|
||||
UnaryDoublePrecisionSpecialValuesTest(kernel, ref, validator_builder);
|
||||
}
|
||||
|
||||
SECTION("Brute force") { UnaryDoublePrecisionBruteForceTest(kernel, ref, validator_builder); }
|
||||
}
|
||||
|
||||
#define MATH_UNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp) \
|
||||
MATH_UNARY_KERNEL_DEF(kern_name) \
|
||||
\
|
||||
TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive - float") { \
|
||||
double (*ref)(double) = ref_func; \
|
||||
UnarySinglePrecisionTest(kern_name##_kernel<float>, ref, \
|
||||
ULPValidatorBuilderFactory<float>(sp_ulp)); \
|
||||
} \
|
||||
\
|
||||
TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive - double") { \
|
||||
long double (*ref)(long double) = ref_func; \
|
||||
UnaryDoublePrecisionTest(kern_name##_kernel<double>, ref, \
|
||||
ULPValidatorBuilderFactory<double>(dp_ulp)); \
|
||||
}
|
||||
|
||||
#define MATH_UNARY_WITHIN_ULP_STL_REF_TEST_DEF(func_name, sp_ulp, dp_ulp) \
|
||||
MATH_UNARY_WITHIN_ULP_TEST_DEF(func_name, std::func_name, sp_ulp, dp_ulp)
|
||||
@@ -0,0 +1,152 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <catch.hpp>
|
||||
|
||||
// Define a new MatcherBase class with a public 'describe' member function because
|
||||
// Catch::MatcherBase::describe is protected and thus can't be used via a pointer to
|
||||
// Catch::MatcherBase.
|
||||
template <typename T> class MatcherBase : public Catch::MatcherBase<T> {
|
||||
public:
|
||||
virtual std::string describe() const = 0;
|
||||
virtual ~MatcherBase() = default;
|
||||
};
|
||||
|
||||
template <typename T, typename Matcher> class ValidatorBase : public MatcherBase<T> {
|
||||
public:
|
||||
template <typename... Ts>
|
||||
ValidatorBase(T target, Ts&&... args) : matcher_{std::forward<Ts>(args)...}, target_{target} {}
|
||||
|
||||
bool match(const T& val) const override {
|
||||
if (std::isnan(target_)) {
|
||||
return std::isnan(val);
|
||||
}
|
||||
|
||||
return matcher_.match(val);
|
||||
}
|
||||
|
||||
std::string describe() const override {
|
||||
if (std::isnan(target_)) {
|
||||
return "is not NaN";
|
||||
}
|
||||
|
||||
return matcher_.describe();
|
||||
}
|
||||
|
||||
private:
|
||||
Matcher matcher_;
|
||||
T target_;
|
||||
bool nan = false;
|
||||
};
|
||||
|
||||
template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
|
||||
return [=](T target, auto&&... args) {
|
||||
return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinUlpsMatcher>>(
|
||||
target, Catch::WithinULP(target, ulps));
|
||||
};
|
||||
};
|
||||
|
||||
template <typename T> auto AbsValidatorBuilderFactory(double margin) {
|
||||
return [=](T target, auto&&... args) {
|
||||
return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinAbsMatcher>>(
|
||||
target, Catch::WithinAbs(target, margin));
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T> auto RelValidatorBuilderFactory(T margin) {
|
||||
return [=](T target, auto&&... args) {
|
||||
return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinRelMatcher>>(
|
||||
target, Catch::WithinRel(target, margin));
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T> class EqValidator : public MatcherBase<T> {
|
||||
public:
|
||||
EqValidator(T target) : target_{target} {}
|
||||
|
||||
bool match(const T& val) const override {
|
||||
if (std::isnan(target_)) {
|
||||
return std::isnan(val);
|
||||
}
|
||||
|
||||
return target_ == val;
|
||||
}
|
||||
|
||||
std::string describe() const override {
|
||||
std::stringstream ss;
|
||||
ss << " is not equal to " << target_;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
private:
|
||||
T target_;
|
||||
};
|
||||
|
||||
template <typename T> auto EqValidatorBuilderFactory() {
|
||||
return [](T val, auto&&... args) { return std::make_unique<EqValidator<T>>(val); };
|
||||
}
|
||||
|
||||
template <typename T, typename U, typename VBF, typename VBS>
|
||||
class PairValidator : public MatcherBase<std::pair<T, U>> {
|
||||
public:
|
||||
PairValidator(const std::pair<T, U>& target, const VBF& vbf, const VBS& vbs)
|
||||
: first_matcher_{vbf(target.first)}, second_matcher_{vbs(target.second)} {}
|
||||
|
||||
bool match(const std::pair<T, U>& val) const override {
|
||||
return first_matcher_->match(val.first) && second_matcher_->match(val.second);
|
||||
}
|
||||
|
||||
std::string describe() const override {
|
||||
return "<" + first_matcher_->describe() + ", " + second_matcher_->describe() + ">";
|
||||
}
|
||||
|
||||
private:
|
||||
decltype(std::declval<VBF>()(std::declval<T>())) first_matcher_;
|
||||
decltype(std::declval<VBS>()(std::declval<U>())) second_matcher_;
|
||||
};
|
||||
|
||||
template <typename T, typename ValidatorBuilder>
|
||||
auto PairValidatorBuilderFactory(const ValidatorBuilder& vb) {
|
||||
return [=](const std::pair<T, T>& t, auto&&... args) {
|
||||
return std::make_unique<PairValidator<T, T, ValidatorBuilder, ValidatorBuilder>>(t, vb, vb);
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T, typename U, typename VBF, typename VBS>
|
||||
auto PairValidatorBuilderFactory(const VBF& vbf, const VBS& vbs) {
|
||||
return [=](const std::pair<T, U>& t, auto&&... args) {
|
||||
return std::make_unique<PairValidator<T, U, VBF, VBS>>(t, vbf, vbs);
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T> class NopValidator : public MatcherBase<T> {
|
||||
public:
|
||||
bool match(const T& val) const override { return true; }
|
||||
|
||||
std::string describe() const override { return ""; }
|
||||
};
|
||||
|
||||
template <typename T> auto NopValidatorBuilderFactory() {
|
||||
return [](auto&&... args) { return std::make_unique<NopValidator<T>>(); };
|
||||
}
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador