EXSWHTEC-287 - Implement tests for trigonometric device math functions #231

Change-Id: I44188fa2f60f2492e05596464c914b8c739be482 [ROCm/hip-tests commit: 36620358e6]
2024-01-22 18:59:43 +05:30
@@ -19,8 +19,19 @@
 # THE SOFTWARE.

 set(TEST_SRC
+  trig_funcs.cc
 )

 hip_add_exe_to_target(NAME MathsTest
                      TEST_SRC ${TEST_SRC}
                      TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
+
+add_test(NAME Unit_Device_Single_Precision_Trig_Functions_Negative
+         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+         trig_single_precision_negative_kernels.cc 66)
+
+add_test(NAME Unit_Device_Double_Precision_Trig_Functions_Negative
+         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+         trig_double_precision_negative_kernels.cc 66)
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "math_common.hh"
+#include "math_special_values.hh"
+
+#include <hip/hip_cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+#define MATH_BINARY_KERNEL_DEF(func_name)                                                          \
+  template <typename T, typename RT = T>                                                           \
+  __global__ void func_name##_kernel(RT* const ys, const size_t num_xs, T* const x1s,              \
+                                     T* const x2s) {                                               \
+    const auto tid = cg::this_grid().thread_rank();                                                \
+    const auto stride = cg::this_grid().size();                                                    \
+                                                                                                   \
+    for (auto i = tid; i < num_xs; i += stride) {                                                  \
+      if constexpr (std::is_same_v<float, T>) {                                                    \
+        ys[i] = func_name##f(x1s[i], x2s[i]);                                                      \
+      } else if constexpr (std::is_same_v<double, T>) {                                            \
+        ys[i] = func_name(x1s[i], x2s[i]);                                                         \
+      }                                                                                            \
+    }                                                                                              \
+  }
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void BinaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg> kernel,
+                                       ref_sig<RT, RTArg, RTArg> ref_func,
+                                       const ValidatorBuilder& validator_builder,
+                                       const TArg a = std::numeric_limits<TArg>::lowest(),
+                                       const TArg b = std::numeric_limits<TArg>::max()) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const uint64_t num_iterations = GetTestIterationCount();
+  const auto max_batch_size =
+      std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 2 + sizeof(T)), num_iterations);
+  LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+
+  MathTest math_test(kernel, max_batch_size);
+
+  auto batch_size = max_batch_size;
+  const auto num_threads = thread_pool.thread_count();
+  for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
+    batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
+
+    const auto min_sub_batch_size = batch_size / num_threads;
+    const auto tail = batch_size % num_threads;
+
+    auto base_idx = 0u;
+    for (auto i = 0u; i < num_threads; ++i) {
+      const auto sub_batch_size = min_sub_batch_size + (i < tail);
+      thread_pool.Post([=, &x1s, &x2s] {
+        const auto generator = [=] {
+          static thread_local std::mt19937 rng(std::random_device{}());
+          std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
+          return static_cast<TArg>(unif_dist(rng));
+        };
+        std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
+        std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
+      });
+      base_idx += sub_batch_size;
+    }
+
+    thread_pool.Wait();
+
+    math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
+                  x2s.ptr());
+  }
+}
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void BinaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg> kernel,
+                                          ref_sig<RT, RTArg, RTArg> ref_func,
+                                          const ValidatorBuilder& validator_builder) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const auto values = std::get<SpecialVals<TArg>>(kSpecialValRegistry);
+
+  const auto size = values.size * values.size;
+  LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+
+  for (auto i = 0u; i < values.size; ++i) {
+    for (auto j = 0u; j < values.size; ++j) {
+      x1s.ptr()[i * values.size + j] = values.data[i];
+      x2s.ptr()[i * values.size + j] = values.data[j];
+    }
+  }
+
+  MathTest math_test(kernel, size);
+  math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
+                                x2s.ptr());
+}
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void BinaryFloatingPointTest(kernel_sig<T, TArg, TArg> kernel, ref_sig<RT, RTArg, RTArg> ref_func,
+                             const ValidatorBuilder& validator_builder) {
+  SECTION("Special values") {
+    BinaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
+  }
+
+  SECTION("Brute force") { BinaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder); }
+}
+
+
+#define MATH_BINARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp)                       \
+  MATH_BINARY_KERNEL_DEF(kern_name)                                                                \
+                                                                                                   \
+  TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) {          \
+    using RT = RefType_t<TestType>;                                                                \
+    RT (*ref)(RT, RT) = ref_func;                                                                  \
+    const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp;                            \
+                                                                                                   \
+    BinaryFloatingPointTest(kern_name##_kernel<TestType>, ref,                                     \
+                            ULPValidatorBuilderFactory<TestType>(ulp));                            \
+  }
@@ -0,0 +1,240 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <cmd_options.hh>
+#include <hip_test_common.hh>
+#include <resource_guards.hh>
+
+#include <hip/hip_cooperative_groups.h>
+
+#include "thread_pool.hh"
+#include "validators.hh"
+
+namespace cg = cooperative_groups;
+
+template <typename T, typename U>
+std::enable_if_t<std::conjunction_v<std::is_arithmetic<T>, std::is_arithmetic<U>>, std::ostream&>
+operator<<(std::ostream& os, const std::pair<T, U>& p) {
+  const auto default_prec = os.precision();
+  return os << "<" << std::setprecision(std::numeric_limits<T>::max_digits10 - 1) << p.first << ", "
+            << std::setprecision(std::numeric_limits<U>::max_digits10 - 1) << p.second << ">"
+            << std::setprecision(default_prec);
+}
+
+// This class represents a generic numerical accuracy math test. Template parameter T is the output
+// type of the function being tested, and template parameter pack Ts represents the input types. The
+// constructor takes a kernel with the signature void(T*, const size_t, Ts*...). The first kernel
+// parameter is the output array, the second parameter is the number of outputs, and the rest of the
+// parameters are arrays containing input values. The number of input arrays depends on the arity of
+// the function being tested e.g. one input array for unary functions, two input arrays for binary
+// functions, etc. The kernel threads take one element from each input array at the index
+// corresponding to that thread, feed the input elements to the testee function, and store the
+// result in the output array at the corresponding index.
+//
+// E.g. for a binary function the kernel would have the following signature:
+//   void kernel(float* y, const size_t n, float* x1, float* x2)
+//
+// The outputs would be calculated in parallel the following way:
+//   y[0] = testee(x1[0], x2[0])
+//   y[1] = testee(x1[1], x2[1])
+//   y[2] = testee(x1[2], x2[2])
+//   ...
+//
+// The constructor also takes max_num_args, which represents the maximum number of input values used
+// for one kernel launch. The device memory for the input and output arrays is allocated based on
+// that number.
+template <typename T, typename... Ts> class MathTest {
+ public:
+  MathTest(void (*kernel)(T*, const size_t, Ts*...), const size_t max_num_args)
+      : kernel_{kernel},
+        xss_dev_(LinearAllocGuard<Ts>(LinearAllocs::hipMalloc, max_num_args * sizeof(Ts))...),
+        y_dev_{LinearAllocs::hipMalloc, max_num_args * sizeof(T)},
+        y_{LinearAllocs::hipHostMalloc, max_num_args * sizeof(T)} {}
+
+  // This method runs the test with the following steps:
+  // 1. Copy the values from the input arrays provided in the parameter pack xss to device memory
+  // 2. Launch the kernel using the configuration provided in grid_dims and block_dims
+  // 3. Copy the outputs back to host memory
+  // 4. Generate the reference values using ref_func and compare against the outputs using the
+  // validator provided by validator_builder
+  // 5. If non-type template parameter parallel is true, then step 4 is broken up into chunks of
+  // work that are done in parallel on the host.
+  template <bool parallel = true, typename RT, typename ValidatorBuilder, typename... RTs>
+  void Run(const ValidatorBuilder& validator_builder, const size_t grid_dims,
+           const size_t block_dims, RT (*const ref_func)(RTs...), const size_t num_args,
+           const Ts*... xss) {
+    fail_flag_.store(false);
+    error_info_.clear();
+    RunImpl<parallel>(validator_builder, grid_dims, block_dims, ref_func, num_args,
+                      std::index_sequence_for<Ts...>{}, xss...);
+  }
+
+ private:
+  void (*kernel_)(T*, const size_t, Ts*...);
+  std::tuple<LinearAllocGuard<Ts>...> xss_dev_;
+  LinearAllocGuard<T> y_dev_;
+  LinearAllocGuard<T> y_;
+  std::atomic<bool> fail_flag_{false};
+  std::mutex mtx_;
+  std::string error_info_;
+
+  template <bool parallel, typename RT, typename ValidatorBuilder, typename... RTs, size_t... I>
+  void RunImpl(const ValidatorBuilder& validator_builder, const size_t grid_dim,
+               const size_t block_dim, RT (*const ref_func)(RTs...), const size_t num_args,
+               std::index_sequence<I...> is, const Ts*... xss) {
+    const auto xss_tup = std::make_tuple(xss...);
+
+    constexpr auto f = [](auto dst, auto src, size_t size) {
+      HIP_CHECK(hipMemcpy(dst, src, size, hipMemcpyHostToDevice))
+    };
+
+    ((f(std::get<I>(xss_dev_).ptr(), std::get<I>(xss_tup),
+        num_args * sizeof(*std::get<I>(xss_tup)))),
+     ...);
+
+    kernel_<<<grid_dim, block_dim>>>(y_dev_.ptr(), num_args, std::get<I>(xss_dev_).ptr()...);
+    HIP_CHECK(hipGetLastError());
+
+    HIP_CHECK(hipMemcpy(y_.ptr(), y_dev_.ptr(), num_args * sizeof(T), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipStreamSynchronize(nullptr));
+
+    if constexpr (!parallel) {
+      for (auto i = 0u; i < num_args; ++i) {
+        const auto actual_val = y_.ptr()[i];
+        const auto ref_val = static_cast<T>(ref_func(xss[i]...));
+        const auto validator = validator_builder(ref_val, xss[i]...);
+
+        if (!validator->match(actual_val)) {
+          const auto log = MakeLogMessage(actual_val, xss[i]...) + validator->describe() + "\n";
+          INFO(log);
+          REQUIRE(false);
+        }
+      }
+
+      return;
+    }
+
+    const auto task = [&, this](size_t iters, size_t base_idx) {
+      for (auto i = 0u; i < iters; ++i) {
+        if (fail_flag_.load(std::memory_order_relaxed)) return;
+
+        const auto actual_val = y_.ptr()[base_idx + i];
+        const auto ref_val = static_cast<T>(ref_func(xss[base_idx + i]...));
+        const auto validator = validator_builder(ref_val, xss[base_idx + i]...);
+
+        if (!validator->match(actual_val)) {
+          fail_flag_.store(true, std::memory_order_relaxed);
+          // Several threads might have passed the first check, but failed validation. On the
+          // chance of this happening, access to the string stream must be serialized.
+          const auto log =
+              MakeLogMessage(actual_val, xss[base_idx + i]...) + validator->describe() + "\n";
+          {
+            std::lock_guard lg{mtx_};
+            error_info_ += log;
+          }
+          return;
+        }
+      }
+    };
+
+    const auto task_count = thread_pool.thread_count();
+    const auto chunk_size = num_args / task_count;
+    const auto tail = num_args % task_count;
+
+    auto base_idx = 0u;
+    for (auto i = 0u; i < task_count; ++i) {
+      const auto iters = chunk_size + (i < tail);
+      thread_pool.Post([=, &task] { task(iters, base_idx); });
+      base_idx += iters;
+    }
+
+    thread_pool.Wait();
+
+    INFO(error_info_);
+    REQUIRE(!fail_flag_);
+  }
+
+  template <typename... Args> std::string MakeLogMessage(T actual_val, Args... args) {
+    std::stringstream ss;
+    ss << "Input value(s): " << std::scientific
+       << std::setprecision(std::numeric_limits<T>::max_digits10 - 1);
+    ((ss << " " << args), ...) << "\n" << actual_val << " ";
+
+    return ss.str();
+  }
+};
+
+template <typename T> struct RefType {};
+
+template <> struct RefType<float> { using type = double; };
+
+template <> struct RefType<double> { using type = long double; };
+
+template <typename T> using RefType_t = typename RefType<T>::type;
+
+template <typename F> auto GetOccupancyMaxPotentialBlockSize(F kernel) {
+  int grid_size = 0, block_size = 0;
+  HIP_CHECK(hipOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0, 0));
+  return std::make_tuple(grid_size, block_size);
+}
+
+inline size_t GetMaxAllowedDeviceMemoryUsage() {
+  hipDeviceProp_t props;
+  HIP_CHECK(hipGetDeviceProperties(&props, 0));
+  return props.totalGlobalMem * (cmd_options.accuracy_max_memory * 0.01f);
+}
+
+inline uint64_t GetTestIterationCount() { return cmd_options.accuracy_iterations; }
+
+template <typename T, typename... Ts> using kernel_sig = void (*)(T*, const size_t, Ts*...);
+
+template <typename T, typename... Ts> using ref_sig = T (*)(Ts...);
+
+template <int error_num> void NegativeTestRTCWrapper(const char* program_source) {
+  hiprtcProgram program{};
+
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "math_test_rtc.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Get the compile log and count compiler error messages
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+  int error_count{0};
+
+  int expected_error_count{error_num};
+  std::string error_message{"error:"};
+
+  size_t n_pos = log.find(error_message, 0);
+  while (n_pos != std::string::npos) {
+    ++error_count;
+    n_pos = log.find(error_message, n_pos + 1);
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,287 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// Disclaimer:
+// This code is based on the work found in OpenCL-CTS authored by The Khronos Group.
+// The original code can be found at https://github.com/KhronosGroup/OpenCL-CTS.
+// We acknowledge the contributions of The Khronos Group to the development of this code.
+
+#pragma once
+
+#include <array>
+#include <limits>
+
+/*-----------------------------------------------------------------------------
+   HEX_FLT, HEXT_DBL, HEX_LDBL -- Create hex floating point literal of type
+   float, double, long double respectively. Arguments:
+
+      sm    -- sign of number,
+      int   -- integer part of mantissa (without `0x' prefix),
+      fract -- fractional part of mantissa (without decimal point and `L' or
+            `LL' suffixes),
+      se    -- sign of exponent,
+      exp   -- absolute value of (binary) exponent.
+
+   Example:
+
+      double yhi = HEX_DBL(+, 1, 5555555555555, -, 2); // 0x1.5555555555555p-2
+
+   Note:
+
+      We have to pass signs as separate arguments because gcc pass negative
+   integer values (e. g. `-2') into a macro as two separate tokens, so
+   `HEX_FLT(1, 0, -2)' produces result `0x1.0p- 2' (note a space between minus
+   and two) which is not a correct floating point literal.
+-----------------------------------------------------------------------------*/
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+// If compiler does not support hex floating point literals:
+#define HEX_FLT(sm, int, fract, se, exp)                                                           \
+  sm ldexpf((float)(0x##int##fract##UL),                                                           \
+            se exp + ilogbf((float)0x##int) - ilogbf((float)(0x##int##fract##UL)))
+#define HEX_DBL(sm, int, fract, se, exp)                                                           \
+  sm ldexp((double)(0x##int##fract##ULL),                                                          \
+           se exp + ilogb((double)0x##int) - ilogb((double)(0x##int##fract##ULL)))
+#define HEX_LDBL(sm, int, fract, se, exp)                                                          \
+  sm ldexpl((long double)(0x##int##fract##ULL),                                                    \
+            se exp + ilogbl((long double)0x##int) - ilogbl((long double)(0x##int##fract##ULL)))
+#else
+// If compiler supports hex floating point literals: just concatenate all the
+// parts into a literal.
+#define HEX_FLT(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp##F
+#define HEX_DBL(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp
+#define HEX_LDBL(sm, int, fract, se, exp) sm 0x##int##.##fract##p##se##exp##L
+#endif
+
+inline constexpr std::array kSpecialValuesDouble{
+    -std::numeric_limits<double>::quiet_NaN(),
+    -std::numeric_limits<double>::infinity(),
+    -std::numeric_limits<double>::max(),
+    HEX_DBL(-, 1, 0000000000001, +, 64),
+    HEX_DBL(-, 1, 0, +, 64),
+    HEX_DBL(-, 1, fffffffffffff, +, 63),
+    HEX_DBL(-, 1, 0000000000001, +, 63),
+    HEX_DBL(-, 1, 0, +, 63),
+    HEX_DBL(-, 1, fffffffffffff, +, 62),
+    HEX_DBL(-, 1, 000002, +, 32),
+    HEX_DBL(-, 1, 0, +, 32),
+    HEX_DBL(-, 1, fffffffffffff, +, 31),
+    HEX_DBL(-, 1, 0000000000001, +, 31),
+    HEX_DBL(-, 1, 0, +, 31),
+    HEX_DBL(-, 1, fffffffffffff, +, 30),
+    -1000.0,
+    -100.0,
+    -4.0,
+    -3.5,
+    -3.0,
+    HEX_DBL(-, 1, 8000000000001, +, 1),
+    -2.5,
+    HEX_DBL(-, 1, 7ffffffffffff, +, 1),
+    -2.0,
+    HEX_DBL(-, 1, 8000000000001, +, 0),
+    -1.5,
+    HEX_DBL(-, 1, 7ffffffffffff, +, 0),
+    HEX_DBL(-, 1, 0000000000001, +, 0),
+    -1.0,
+    HEX_DBL(-, 1, fffffffffffff, -, 1),
+    HEX_DBL(-, 1, 0000000000001, -, 1),
+    -0.5,
+    HEX_DBL(-, 1, fffffffffffff, -, 2),
+    HEX_DBL(-, 1, 0000000000001, -, 2),
+    -0.25,
+    HEX_DBL(-, 1, fffffffffffff, -, 3),
+    HEX_DBL(-, 1, 0000000000001, -, 1022),
+    -std::numeric_limits<double>::min(),
+    HEX_DBL(-, 0, fffffffffffff, -, 1022),
+    HEX_DBL(-, 0, 0000000000fff, -, 1022),
+    HEX_DBL(-, 0, 00000000000fe, -, 1022),
+    HEX_DBL(-, 0, 000000000000e, -, 1022),
+    HEX_DBL(-, 0, 000000000000c, -, 1022),
+    HEX_DBL(-, 0, 000000000000a, -, 1022),
+    HEX_DBL(-, 0, 0000000000008, -, 1022),
+    HEX_DBL(-, 0, 0000000000007, -, 1022),
+    HEX_DBL(-, 0, 0000000000006, -, 1022),
+    HEX_DBL(-, 0, 0000000000005, -, 1022),
+    HEX_DBL(-, 0, 0000000000004, -, 1022),
+    HEX_DBL(-, 0, 0000000000003, -, 1022),
+    HEX_DBL(-, 0, 0000000000002, -, 1022),
+    HEX_DBL(-, 0, 0000000000001, -, 1022),
+    -0.0,
+
+    std::numeric_limits<double>::quiet_NaN(),
+    std::numeric_limits<double>::infinity(),
+    std::numeric_limits<double>::max(),
+    HEX_DBL(+, 1, 0000000000001, +, 64),
+    HEX_DBL(+, 1, 0, +, 64),
+    HEX_DBL(+, 1, fffffffffffff, +, 63),
+    HEX_DBL(+, 1, 0000000000001, +, 63),
+    HEX_DBL(+, 1, 0, +, 63),
+    HEX_DBL(+, 1, fffffffffffff, +, 62),
+    HEX_DBL(+, 1, 000002, +, 32),
+    HEX_DBL(+, 1, 0, +, 32),
+    HEX_DBL(+, 1, fffffffffffff, +, 31),
+    HEX_DBL(+, 1, 0000000000001, +, 31),
+    HEX_DBL(+, 1, 0, +, 31),
+    HEX_DBL(+, 1, fffffffffffff, +, 30),
+    +1000.0,
+    +100.0,
+    +4.0,
+    +3.5,
+    +3.0,
+    HEX_DBL(+, 1, 8000000000001, +, 1),
+    +2.5,
+    HEX_DBL(+, 1, 7ffffffffffff, +, 1),
+    +2.0,
+    HEX_DBL(+, 1, 8000000000001, +, 0),
+    +1.5,
+    HEX_DBL(+, 1, 7ffffffffffff, +, 0),
+    HEX_DBL(+, 1, 0000000000001, +, 0),
+    +1.0,
+    HEX_DBL(+, 1, fffffffffffff, -, 1),
+    HEX_DBL(+, 1, 0000000000001, -, 1),
+    +0.5,
+    HEX_DBL(+, 1, fffffffffffff, -, 2),
+    HEX_DBL(+, 1, 0000000000001, -, 2),
+    +0.25,
+    HEX_DBL(+, 1, fffffffffffff, -, 3),
+    HEX_DBL(+, 1, 0000000000001, -, 1022),
+    +std::numeric_limits<double>::min(),
+    HEX_DBL(+, 0, fffffffffffff, -, 1022),
+    HEX_DBL(+, 0, 0000000000fff, -, 1022),
+    HEX_DBL(+, 0, 00000000000fe, -, 1022),
+    HEX_DBL(+, 0, 000000000000e, -, 1022),
+    HEX_DBL(+, 0, 000000000000c, -, 1022),
+    HEX_DBL(+, 0, 000000000000a, -, 1022),
+    HEX_DBL(+, 0, 0000000000008, -, 1022),
+    HEX_DBL(+, 0, 0000000000007, -, 1022),
+    HEX_DBL(+, 0, 0000000000006, -, 1022),
+    HEX_DBL(+, 0, 0000000000005, -, 1022),
+    HEX_DBL(+, 0, 0000000000004, -, 1022),
+    HEX_DBL(+, 0, 0000000000003, -, 1022),
+    HEX_DBL(+, 0, 0000000000002, -, 1022),
+    HEX_DBL(+, 0, 0000000000001, -, 1022),
+    +0.0,
+};
+
+inline constexpr std::array kSpecialValuesFloat{
+    -std::numeric_limits<float>::quiet_NaN(),
+    -std::numeric_limits<float>::infinity(),
+    -std::numeric_limits<float>::max(),
+    HEX_FLT(-, 1, 000002, +, 64),
+    HEX_FLT(-, 1, 0, +, 64),
+    HEX_FLT(-, 1, fffffe, +, 63),
+    HEX_FLT(-, 1, 000002, +, 63),
+    HEX_FLT(-, 1, 0, +, 63),
+    HEX_FLT(-, 1, fffffe, +, 62),
+    HEX_FLT(-, 1, 000002, +, 32),
+    HEX_FLT(-, 1, 0, +, 32),
+    HEX_FLT(-, 1, fffffe, +, 31),
+    HEX_FLT(-, 1, 000002, +, 31),
+    HEX_FLT(-, 1, 0, +, 31),
+    HEX_FLT(-, 1, fffffe, +, 30),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    HEX_FLT(-, 1, 800002, +, 1),
+    -2.5f,
+    HEX_FLT(-, 1, 7ffffe, +, 1),
+    -2.0f,
+    HEX_FLT(-, 1, 800002, +, 0),
+    -1.5f,
+    HEX_FLT(-, 1, 7ffffe, +, 0),
+    HEX_FLT(-, 1, 000002, +, 0),
+    -1.0f,
+    HEX_FLT(-, 1, fffffe, -, 1),
+    HEX_FLT(-, 1, 000002, -, 1),
+    -0.5f,
+    HEX_FLT(-, 1, fffffe, -, 2),
+    HEX_FLT(-, 1, 000002, -, 2),
+    -0.25f,
+    HEX_FLT(-, 1, fffffe, -, 3),
+    HEX_FLT(-, 1, 000002, -, 126),
+    -std::numeric_limits<float>::min(),
+    HEX_FLT(-, 0, fffffe, -, 126),
+    HEX_FLT(-, 0, 000ffe, -, 126),
+    HEX_FLT(-, 0, 0000fe, -, 126),
+    HEX_FLT(-, 0, 00000e, -, 126),
+    HEX_FLT(-, 0, 00000c, -, 126),
+    HEX_FLT(-, 0, 00000a, -, 126),
+    HEX_FLT(-, 0, 000008, -, 126),
+    HEX_FLT(-, 0, 000006, -, 126),
+    HEX_FLT(-, 0, 000004, -, 126),
+    HEX_FLT(-, 0, 000002, -, 126),
+    -0.0f,
+
+    std::numeric_limits<float>::quiet_NaN(),
+    std::numeric_limits<float>::infinity(),
+    std::numeric_limits<float>::max(),
+    HEX_FLT(+, 1, 000002, +, 64),
+    HEX_FLT(+, 1, 0, +, 64),
+    HEX_FLT(+, 1, fffffe, +, 63),
+    HEX_FLT(+, 1, 000002, +, 63),
+    HEX_FLT(+, 1, 0, +, 63),
+    HEX_FLT(+, 1, fffffe, +, 62),
+    HEX_FLT(+, 1, 000002, +, 32),
+    HEX_FLT(+, 1, 0, +, 32),
+    HEX_FLT(+, 1, fffffe, +, 31),
+    HEX_FLT(+, 1, 000002, +, 31),
+    HEX_FLT(+, 1, 0, +, 31),
+    HEX_FLT(+, 1, fffffe, +, 30),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    HEX_FLT(+, 1, 800002, +, 1),
+    2.5f,
+    HEX_FLT(+, 1, 7ffffe, +, 1),
+    +2.0f,
+    HEX_FLT(+, 1, 800002, +, 0),
+    1.5f,
+    HEX_FLT(+, 1, 7ffffe, +, 0),
+    HEX_FLT(+, 1, 000002, +, 0),
+    +1.0f,
+    HEX_FLT(+, 1, fffffe, -, 1),
+    HEX_FLT(+, 1, 000002, -, 1),
+    +0.5f,
+    HEX_FLT(+, 1, fffffe, -, 2),
+    HEX_FLT(+, 1, 000002, -, 2),
+    +0.25f,
+    HEX_FLT(+, 1, fffffe, -, 3),
+    HEX_FLT(+, 1, 000002, -, 126),
+    +std::numeric_limits<float>::min(),
+    HEX_FLT(+, 0, fffffe, -, 126),
+    HEX_FLT(+, 0, 000ffe, -, 126),
+    HEX_FLT(+, 0, 0000fe, -, 126),
+    HEX_FLT(+, 0, 00000e, -, 126),
+    HEX_FLT(+, 0, 00000c, -, 126),
+    HEX_FLT(+, 0, 00000a, -, 126),
+    HEX_FLT(+, 0, 000008, -, 126),
+    HEX_FLT(+, 0, 000006, -, 126),
+    HEX_FLT(+, 0, 000004, -, 126),
+    HEX_FLT(+, 0, 000002, -, 126),
+    +0.0f,
+};
+
+template <typename T> struct SpecialVals {
+  const T* const data;
+  const size_t size;
+};
+
+inline constexpr auto kSpecialValRegistry =
+    std::make_tuple(SpecialVals<float>{kSpecialValuesFloat.data(), kSpecialValuesFloat.size()},
+                    SpecialVals<double>{kSpecialValuesDouble.data(), kSpecialValuesDouble.size()});
@@ -0,0 +1,246 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "math_common.hh"
+#include "math_special_values.hh"
+
+#include <hip/hip_cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+#define MATH_QUATERNARY_KERNEL_DEF(func_name)                                                      \
+  template <typename T>                                                                            \
+  __global__ void func_name##_kernel(T* const ys, const size_t num_xs, T* const x1s, T* const x2s, \
+                                     T* const x3s, T* const x4s) {                                 \
+    const auto tid = cg::this_grid().thread_rank();                                                \
+    const auto stride = cg::this_grid().size();                                                    \
+                                                                                                   \
+    for (auto i = tid; i < num_xs; i += stride) {                                                  \
+      if constexpr (std::is_same_v<float, T>) {                                                    \
+        ys[i] = func_name##f(x1s[i], x2s[i], x3s[i], x4s[i]);                                      \
+      } else if constexpr (std::is_same_v<double, T>) {                                            \
+        ys[i] = func_name(x1s[i], x2s[i], x3s[i], x4s[i]);                                         \
+      }                                                                                            \
+    }                                                                                              \
+  }
+
+inline constexpr std::array kSpecialValuesReducedDouble{
+    -std::numeric_limits<double>::quiet_NaN(),
+    -std::numeric_limits<double>::infinity(),
+    -std::numeric_limits<double>::max(),
+    HEX_DBL(-, 1, 0000000000001, +, 64),
+    HEX_DBL(-, 1, fffffffffffff, +, 63),
+    HEX_DBL(-, 1, fffffffffffff, +, 62),
+    HEX_DBL(-, 1, 0, +, 32),
+    HEX_DBL(-, 1, 0000000000001, +, 31),
+    HEX_DBL(-, 1, fffffffffffff, +, 30),
+    -1000.0,
+    -3.5,
+    HEX_DBL(-, 1, 8000000000001, +, 1),
+    -2.5,
+    HEX_DBL(-, 1, 8000000000001, +, 0),
+    -1.5,
+    -0.5,
+    -0.25,
+    HEX_DBL(-, 1, fffffffffffff, -, 3),
+    -std::numeric_limits<double>::min(),
+    HEX_DBL(-, 0, fffffffffffff, -, 1022),
+    HEX_DBL(-, 0, 0000000000001, -, 1022),
+    -0.0,
+
+    std::numeric_limits<double>::quiet_NaN(),
+    std::numeric_limits<double>::infinity(),
+    std::numeric_limits<double>::max(),
+    HEX_DBL(+, 1, 0, +, 64),
+    HEX_DBL(+, 1, 0000000000001, +, 63),
+    HEX_DBL(+, 1, 000002, +, 32),
+    HEX_DBL(+, 1, fffffffffffff, +, 31),
+    HEX_DBL(+, 1, 0, +, 31),
+    HEX_DBL(+, 1, fffffffffffff, +, 30),
+    +100.0,
+    +3.0,
+    HEX_DBL(+, 1, 7ffffffffffff, +, 1),
+    +2.0,
+    HEX_DBL(+, 1, 7ffffffffffff, +, 0),
+    +1.0,
+    HEX_DBL(+, 1, fffffffffffff, -, 2),
+    +std::numeric_limits<double>::min(),
+    HEX_DBL(+, 0, 0000000000fff, -, 1022),
+    HEX_DBL(+, 0, 0000000000007, -, 1022),
+    +0.0,
+};
+
+inline constexpr std::array kSpecialValuesReducedFloat{
+    -std::numeric_limits<float>::quiet_NaN(),
+    -std::numeric_limits<float>::infinity(),
+    -std::numeric_limits<float>::max(),
+    HEX_FLT(-, 1, 000002, +, 64),
+    HEX_FLT(-, 1, fffffe, +, 63),
+    HEX_FLT(-, 1, fffffe, +, 62),
+    HEX_FLT(-, 1, 0, +, 32),
+    HEX_FLT(-, 1, fffffe, +, 31),
+    HEX_FLT(-, 1, fffffe, +, 30),
+    -1000.f,
+    -3.5f,
+    HEX_FLT(-, 1, 800002, +, 1),
+    -2.5f,
+    HEX_FLT(-, 1, 800002, +, 0),
+    -1.5f,
+    -0.5f,
+    -0.25f,
+    HEX_FLT(-, 1, fffffe, -, 3),
+    -std::numeric_limits<float>::min(),
+    HEX_FLT(-, 0, fffffe, -, 126),
+    HEX_FLT(-, 0, 000002, -, 126),
+    -0.0f,
+
+    std::numeric_limits<float>::quiet_NaN(),
+    std::numeric_limits<float>::infinity(),
+    std::numeric_limits<float>::max(),
+    HEX_FLT(+, 1, 0, +, 64),
+    HEX_FLT(+, 1, 000002, +, 63),
+    HEX_FLT(+, 1, 000002, +, 32),
+    HEX_FLT(+, 1, 000002, +, 31),
+    HEX_FLT(+, 1, fffffe, +, 30),
+    +100.f,
+    +4.0f,
+    HEX_FLT(+, 1, 7ffffe, +, 1),
+    +2.0f,
+    HEX_FLT(+, 1, 7ffffe, +, 0),
+    +1.0f,
+    HEX_FLT(+, 1, fffffe, -, 2),
+    +std::numeric_limits<float>::min(),
+    HEX_FLT(+, 0, 000ffe, -, 126),
+    HEX_FLT(+, 0, 000006, -, 126),
+    +0.0f,
+};
+
+inline constexpr auto kSpecialValReducedRegistry = std::make_tuple(
+    SpecialVals<float>{kSpecialValuesReducedFloat.data(), kSpecialValuesReducedFloat.size()},
+    SpecialVals<double>{kSpecialValuesReducedDouble.data(), kSpecialValuesReducedDouble.size()});
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void QuaternaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
+                                           ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
+                                           const ValidatorBuilder& validator_builder,
+                                           const TArg a = std::numeric_limits<TArg>::lowest(),
+                                           const TArg b = std::numeric_limits<TArg>::max()) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const uint64_t num_iterations = GetTestIterationCount();
+  const auto max_batch_size =
+      std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 4 + sizeof(T)), num_iterations);
+  LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x4s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+
+  MathTest math_test(kernel, max_batch_size);
+
+  auto batch_size = max_batch_size;
+  const auto num_threads = thread_pool.thread_count();
+  for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
+    batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
+
+    const auto min_sub_batch_size = batch_size / num_threads;
+    const auto tail = batch_size % num_threads;
+
+    auto base_idx = 0u;
+    for (auto i = 0u; i < num_threads; ++i) {
+      const auto sub_batch_size = min_sub_batch_size + (i < tail);
+      thread_pool.Post([=, &x1s, &x2s, &x3s, &x4s] {
+        const auto generator = [=] {
+          static thread_local std::mt19937 rng(std::random_device{}());
+          std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
+          return static_cast<TArg>(unif_dist(rng));
+        };
+        std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
+        std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
+        std::generate(x3s.ptr() + base_idx, x3s.ptr() + base_idx + sub_batch_size, generator);
+        std::generate(x4s.ptr() + base_idx, x4s.ptr() + base_idx + sub_batch_size, generator);
+      });
+      base_idx += sub_batch_size;
+    }
+
+    thread_pool.Wait();
+
+    math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
+                  x2s.ptr(), x3s.ptr(), x4s.ptr());
+  }
+}
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void QuaternaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
+                                              ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
+                                              const ValidatorBuilder& validator_builder) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const auto values = std::get<SpecialVals<TArg>>(kSpecialValReducedRegistry);
+
+  const auto size = values.size * values.size * values.size * values.size;
+  LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x4s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+
+  for (auto i = 0u; i < values.size; ++i) {
+    for (auto j = 0u; j < values.size; ++j) {
+      for (auto k = 0u; k < values.size; ++k) {
+        for (auto l = 0u; l < values.size; ++l) {
+          x1s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[i];
+          x2s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[j];
+          x3s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[k];
+          x4s.ptr()[((i * values.size + j) * values.size + k) * values.size + l] = values.data[l];
+        }
+      }
+    }
+  }
+
+  MathTest math_test(kernel, size);
+  math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
+                                x2s.ptr(), x3s.ptr(), x4s.ptr());
+}
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void QuaternaryFloatingPointTest(kernel_sig<T, TArg, TArg, TArg, TArg> kernel,
+                                 ref_sig<RT, RTArg, RTArg, RTArg, RTArg> ref_func,
+                                 const ValidatorBuilder& validator_builder) {
+  SECTION("Special values") {
+    QuaternaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
+  }
+
+  SECTION("Brute force") {
+    QuaternaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder);
+  }
+}
+
+
+#define MATH_QUATERNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp)                   \
+  MATH_QUATERNARY_KERNEL_DEF(kern_name)                                                            \
+                                                                                                   \
+  TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) {          \
+    using RT = RefType_t<TestType>;                                                                \
+    RT (*ref)(RT, RT, RT, RT) = ref_func;                                                          \
+    const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp;                            \
+                                                                                                   \
+    QuaternaryFloatingPointTest(kern_name##_kernel<TestType>, ref,                                 \
+                                ULPValidatorBuilderFactory<TestType>(ulp));                        \
+  }
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "math_common.hh"
+#include "math_special_values.hh"
+
+#include <hip/hip_cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+#define MATH_TERNARY_KERNEL_DEF(func_name)                                                         \
+  template <typename T>                                                                            \
+  __global__ void func_name##_kernel(T* const ys, const size_t num_xs, T* const x1s, T* const x2s, \
+                                     T* const x3s) {                                               \
+    const auto tid = cg::this_grid().thread_rank();                                                \
+    const auto stride = cg::this_grid().size();                                                    \
+                                                                                                   \
+    for (auto i = tid; i < num_xs; i += stride) {                                                  \
+      if constexpr (std::is_same_v<float, T>) {                                                    \
+        ys[i] = func_name##f(x1s[i], x2s[i], x3s[i]);                                              \
+      } else if constexpr (std::is_same_v<double, T>) {                                            \
+        ys[i] = func_name(x1s[i], x2s[i], x3s[i]);                                                 \
+      }                                                                                            \
+    }                                                                                              \
+  }
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void TernaryFloatingPointBruteForceTest(kernel_sig<T, TArg, TArg, TArg> kernel,
+                                       ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
+                                       const ValidatorBuilder& validator_builder,
+                                       const TArg a = std::numeric_limits<TArg>::lowest(),
+                                       const TArg b = std::numeric_limits<TArg>::max()) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const uint64_t num_iterations = GetTestIterationCount();
+  const auto max_batch_size =
+      std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(TArg) * 3 + sizeof(T)), num_iterations);
+  LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
+
+  MathTest math_test(kernel, max_batch_size);
+
+  auto batch_size = max_batch_size;
+  const auto num_threads = thread_pool.thread_count();
+  for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
+    batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
+
+    const auto min_sub_batch_size = batch_size / num_threads;
+    const auto tail = batch_size % num_threads;
+
+    auto base_idx = 0u;
+    for (auto i = 0u; i < num_threads; ++i) {
+      const auto sub_batch_size = min_sub_batch_size + (i < tail);
+      thread_pool.Post([=, &x1s, &x2s, &x3s] {
+        const auto generator = [=] {
+          static thread_local std::mt19937 rng(std::random_device{}());
+          std::uniform_real_distribution<RefType_t<TArg>> unif_dist(a, b);
+          return static_cast<TArg>(unif_dist(rng));
+        };
+        std::generate(x1s.ptr() + base_idx, x1s.ptr() + base_idx + sub_batch_size, generator);
+        std::generate(x2s.ptr() + base_idx, x2s.ptr() + base_idx + sub_batch_size, generator);
+        std::generate(x3s.ptr() + base_idx, x3s.ptr() + base_idx + sub_batch_size, generator);
+      });
+      base_idx += sub_batch_size;
+    }
+
+    thread_pool.Wait();
+
+    math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, x1s.ptr(),
+                  x2s.ptr(), x3s.ptr());
+  }
+}
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void TernaryFloatingPointSpecialValuesTest(kernel_sig<T, TArg, TArg, TArg> kernel,
+                                          ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
+                                          const ValidatorBuilder& validator_builder) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const auto values = std::get<SpecialVals<TArg>>(kSpecialValRegistry);
+
+  const auto size = values.size * values.size * values.size;
+  LinearAllocGuard<TArg> x1s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x2s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+  LinearAllocGuard<TArg> x3s{LinearAllocs::hipHostMalloc, size * sizeof(TArg)};
+
+  for (auto i = 0u; i < values.size; ++i) {
+    for (auto j = 0u; j < values.size; ++j) {
+      for (auto k = 0u; k < values.size; ++k) {
+        x1s.ptr()[(i * values.size + j) * values.size + k] = values.data[i];
+        x2s.ptr()[(i * values.size + j) * values.size + k] = values.data[j];
+        x3s.ptr()[(i * values.size + j) * values.size + k] = values.data[k];
+      }
+    }
+  }
+
+  MathTest math_test(kernel, size);
+  math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, size, x1s.ptr(),
+                                x2s.ptr(), x3s.ptr());
+}
+
+template <typename T, typename TArg, typename RT, typename RTArg, typename ValidatorBuilder>
+void TernaryFloatingPointTest(kernel_sig<T, TArg, TArg, TArg> kernel, ref_sig<RT, RTArg, RTArg, RTArg> ref_func,
+                             const ValidatorBuilder& validator_builder) {
+  SECTION("Special values") {
+    TernaryFloatingPointSpecialValuesTest(kernel, ref_func, validator_builder);
+  }
+
+  SECTION("Brute force") { TernaryFloatingPointBruteForceTest(kernel, ref_func, validator_builder); }
+}
+
+
+#define MATH_TERNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp)                      \
+  MATH_TERNARY_KERNEL_DEF(kern_name)                                                               \
+                                                                                                   \
+  TEMPLATE_TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive", "", float, double) {          \
+    using RT = RefType_t<TestType>;                                                                \
+    RT (*ref)(RT, RT, RT) = ref_func;                                                              \
+    const auto ulp = std::is_same_v<float, TestType> ? sp_ulp : dp_ulp;                            \
+                                                                                                   \
+    TernaryFloatingPointTest(kern_name##_kernel<TestType>, ref,                                    \
+                            ULPValidatorBuilderFactory<TestType>(ulp));                            \
+  }
@@ -0,0 +1,64 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <atomic>
+#include <thread>
+
+#include <boost/asio/post.hpp>
+#include <boost/asio/thread_pool.hpp>
+
+// This is a simple wrapper around boost::asio::thread_pool that keeps track of the number of
+// currently active tasks using an atomic counter.
+class ThreadPool {
+ public:
+  ThreadPool(size_t thread_count = std::thread::hardware_concurrency())
+      : thread_count_(thread_count) {}
+
+  ~ThreadPool() { thread_pool_.join(); }
+
+  // Submits a task to the thread pool and increments the number of active tasks. The task is
+  // wrapped in a lambda that decrements the number of active tasks upon completion.
+  template <typename T> void Post(T&& task) {
+    ++active_tasks_;
+    auto&& task_wrapper = [task, this] {
+      task();
+      --active_tasks_;
+    };
+    boost::asio::post(thread_pool_, task_wrapper);
+  }
+
+  // Busy waits for the number of active tasks to reach zero.
+  void Wait() const {
+    while (active_tasks_.load(std::memory_order_relaxed))
+      ;
+  }
+
+  size_t thread_count() const { return thread_count_; }
+
+ private:
+  const size_t thread_count_;
+  boost::asio::thread_pool thread_pool_{thread_count_};
+  std::atomic<size_t> active_tasks_;
+};
+
+inline ThreadPool thread_pool{};
@@ -0,0 +1,108 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+#define TRIG_DP_UNARY_NEGATIVE_KERNELS(func_name)                                                  \
+  __global__ void func_name##_kernel_v1(double* x) { double result = func_name(x); }               \
+  __global__ void func_name##_kernel_v2(Dummy x) { double result = func_name(x); }
+
+/*Expecting 2 errors per macro invocation - 26 total*/
+TRIG_DP_UNARY_NEGATIVE_KERNELS(sin)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(cos)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(tan)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(asin)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(acos)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(atan)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(sinh)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(cosh)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(tanh)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(asinh)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(atanh)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(sinpi)
+TRIG_DP_UNARY_NEGATIVE_KERNELS(cospi)
+
+/*Expecting 4 errors*/
+__global__ void atan2_kernel_v1(double* x, double y) { double result = atan2(x, y); }
+__global__ void atan2_kernel_v2(double x, double* y) { double result = atan2(x, y); }
+__global__ void atan2_kernel_v3(Dummy x, double y) { double result = atan2(x, y); }
+__global__ void atan2_kernel_v4(double x, Dummy y) { double result = atan2(x, y); }
+
+/*Expecting 18 errors*/
+__global__ void sincos_kernel_v1(double* x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v2(Dummy x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v3(double x, char* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v4(double x, short* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v5(double x, int* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v6(double x, long* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v7(double x, long long* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v8(double x, float* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v9(double x, Dummy* sptr, double* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v10(double x, const double* sptr, double* cptr) {
+  sincos(x, sptr, cptr);
+}
+__global__ void sincos_kernel_v11(double x, double* sptr, char* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v12(double x, double* sptr, short* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v13(double x, double* sptr, int* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v14(double x, double* sptr, long* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v15(double x, double* sptr, long long* cptr) {
+  sincos(x, sptr, cptr);
+}
+__global__ void sincos_kernel_v16(double x, double* sptr, float* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v17(double x, double* sptr, Dummy* cptr) { sincos(x, sptr, cptr); }
+__global__ void sincos_kernel_v18(double x, double* sptr, const double* cptr) {
+  sincos(x, sptr, cptr);
+}
+
+/*Expecting 18 errors*/
+__global__ void sincospi_kernel_v1(float* x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v3(float x, char* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v4(float x, short* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v5(float x, int* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v6(float x, long* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v7(float x, long long* sptr, float* cptr) {
+  sincospi(x, sptr, cptr);
+}
+__global__ void sincospi_kernel_v8(float x, double* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v10(float x, const float* sptr, float* cptr) {
+  sincospi(x, sptr, cptr);
+}
+__global__ void sincospi_kernel_v11(float x, float* sptr, char* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v12(float x, float* sptr, short* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v13(float x, float* sptr, int* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v14(float x, float* sptr, long* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v15(float x, float* sptr, long long* cptr) {
+  sincospi(x, sptr, cptr);
+}
+__global__ void sincospi_kernel_v16(float x, float* sptr, double* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v17(float x, float* sptr, Dummy* cptr) { sincospi(x, sptr, cptr); }
+__global__ void sincospi_kernel_v18(float x, float* sptr, const float* cptr) {
+  sincospi(x, sptr, cptr);
+}
@@ -0,0 +1,137 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "trig_negative_kernels_rtc.hh"
+
+#include "unary_common.hh"
+#include "binary_common.hh"
+
+#include <boost/math/special_functions.hpp>
+
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(sin, std::sin, 2, 2);
+TEST_CASE("Unit_Device_sin_sinf_Negative_RTC") { NegativeTestRTCWrapper<4>(kSin); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(cos, std::cos, 2, 2)
+TEST_CASE("Unit_Device_cos_cosf_Negative_RTC") { NegativeTestRTCWrapper<4>(kCos); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(tan, std::tan, 4, 2)
+TEST_CASE("Unit_Device_tan_tanf_Negative_RTC") { NegativeTestRTCWrapper<4>(kTan); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(asin, std::asin, 2, 2)
+TEST_CASE("Unit_Device_asin_asinf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAsin); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(acos, std::acos, 2, 2)
+TEST_CASE("Unit_Device_acos_acosf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAcos); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(atan, std::atan, 2, 2)
+TEST_CASE("Unit_Device_atan_atanf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAtan); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(sinh, std::sinh, 3, 2)
+TEST_CASE("Unit_Device_sinh_sinhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kSinh); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(cosh, std::cosh, 2, 1)
+TEST_CASE("Unit_Device_cosh_coshf_Negative_RTC") { NegativeTestRTCWrapper<4>(kCosh); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(tanh, std::tanh, 2, 1)
+TEST_CASE("Unit_Device_tanh_tanhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kTanh); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(asinh, std::asinh, 3, 2)
+TEST_CASE("Unit_Device_asinh_asinhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAsinh); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(acosh, std::acosh, 4, 2)
+TEST_CASE("Unit_Device_acosh_acoshf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAcosh); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(atanh, std::atanh, 3, 2)
+TEST_CASE("Unit_Device_atanh_atanhf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAtanh); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(sinpi, boost::math::sin_pi, 2, 2);
+TEST_CASE("Unit_Device_sinpi_sinpif_Negative_RTC") { NegativeTestRTCWrapper<4>(kSinpi); }
+
+MATH_UNARY_WITHIN_ULP_TEST_DEF(cospi, boost::math::cos_pi, 2, 2);
+TEST_CASE("Unit_Device_cospi_cospif_Negative_RTC") { NegativeTestRTCWrapper<4>(kCospi); }
+
+MATH_BINARY_WITHIN_ULP_TEST_DEF(atan2, std::atan2, 3, 2);
+TEST_CASE("Unit_Device_atan2_atan2f_Negative_RTC") { NegativeTestRTCWrapper<8>(kAtan2); }
+
+
+template <typename T>
+__global__ void sincos_kernel(std::pair<T, T>* const ys, const size_t num_xs, T* const xs) {
+  const auto tid = cg::this_grid().thread_rank();
+  const auto stride = cg::this_grid().size();
+
+  for (auto i = tid; i < num_xs; i += stride) {
+    if constexpr (std::is_same_v<float, T>) {
+      sincosf(xs[i], &ys[i].first, &ys[i].second);
+    } else if constexpr (std::is_same_v<double, T>) {
+      sincos(xs[i], &ys[i].first, &ys[i].second);
+    }
+  }
+}
+
+template <typename T> std::pair<T, T> sincos(T x) { return {std::sin(x), std::cos(x)}; }
+
+TEST_CASE("Unit_Device_sincos_Accuracy_Positive - float") {
+  UnarySinglePrecisionTest(
+      sincos_kernel<float>, sincos<double>,
+      PairValidatorBuilderFactory<float>(ULPValidatorBuilderFactory<float>(2)));
+}
+
+TEST_CASE("Unit_Device_sincos_Accuracy_Positive - double") {
+  const auto validator_builder =
+      PairValidatorBuilderFactory<double>(ULPValidatorBuilderFactory<double>(2));
+  UnaryDoublePrecisionTest(sincos_kernel<double>, sincos<long double>, validator_builder);
+}
+
+TEST_CASE("Unit_Device_sincos_sincosf_Negative_RTC") { NegativeTestRTCWrapper<36>(kSincos); }
+
+
+template <typename T>
+__global__ void sincospi_kernel(std::pair<T, T>* const ys, const size_t num_xs, T* const xs) {
+  const auto tid = cg::this_grid().thread_rank();
+  const auto stride = cg::this_grid().size();
+
+  for (auto i = tid; i < num_xs; i += stride) {
+    if constexpr (std::is_same_v<float, T>) {
+      sincospif(xs[i], &ys[i].first, &ys[i].second);
+    } else if constexpr (std::is_same_v<double, T>) {
+      sincospi(xs[i], &ys[i].first, &ys[i].second);
+    }
+  }
+}
+
+template <typename T> std::pair<T, T> sincospi(T x) {
+  return {boost::math::sin_pi(x), boost::math::cos_pi(x)};
+}
+
+TEST_CASE("Unit_Device_sincospi_Accuracy_Positive - float") {
+  UnarySinglePrecisionTest(
+      sincospi_kernel<float>, sincospi<double>,
+      PairValidatorBuilderFactory<float>(ULPValidatorBuilderFactory<float>(2)));
+}
+
+TEST_CASE("Unit_Device_sincospi_Accuracy_Positive - double") {
+  const auto validator_builder =
+      PairValidatorBuilderFactory<double>(ULPValidatorBuilderFactory<double>(2));
+  UnaryDoublePrecisionTest(sincospi_kernel<double>, sincospi<long double>, validator_builder);
+}
+
+TEST_CASE("Unit_Device_sincospi_sincospif_Negative_RTC") { NegativeTestRTCWrapper<36>(kSincospi); }
@@ -0,0 +1,320 @@
+// #define TRIG_UNARY_NEGATIVE_KERNELS(func_name)
+// class Dummy {
+//  public:
+//   __device__ Dummy() {}
+//   __device__ ~Dummy() {}
+// };
+// __global__ void func_name##f_kernel_v1(float* x) { float result = func_name##f(x); }
+// __global__ void func_name##f_kernel_v2(Dummy x) { float result = func_name##f(x); }
+// __global__ void func_name##_kernel_v1(double* x) { double result = func_name(x); }
+// __global__ void func_name##_kernel_v2(Dummy x) { double result = func_name(x); }
+
+static constexpr auto kSin{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void sinf_kernel_v1(float* x) { float result = sinf(x); }             
+  __global__ void sinf_kernel_v2(Dummy x) { float result = sinf(x); }              
+  __global__ void sin_kernel_v1(double* x) { double result = sin(x); }               
+  __global__ void sin_kernel_v2(Dummy x) { double result = sin(x); }
+  )"};
+
+static constexpr auto kCos{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void cosf_kernel_v1(float* x) { float result = cosf(x); }             
+  __global__ void cosf_kernel_v2(Dummy x) { float result = cosf(x); }              
+  __global__ void cos_kernel_v1(double* x) { double result = cos(x); }               
+  __global__ void cos_kernel_v2(Dummy x) { double result = cos(x); }
+  )"};
+
+static constexpr auto kTan{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void tanf_kernel_v1(float* x) { float result = tanf(x); }             
+  __global__ void tanf_kernel_v2(Dummy x) { float result = tanf(x); }              
+  __global__ void tan_kernel_v1(double* x) { double result = tan(x); }               
+  __global__ void tan_kernel_v2(Dummy x) { double result = tan(x); }
+  )"};
+
+static constexpr auto kAsin{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void asinf_kernel_v1(float* x) { float result = asinf(x); }             
+  __global__ void asinf_kernel_v2(Dummy x) { float result = asinf(x); }              
+  __global__ void asin_kernel_v1(double* x) { double result = asin(x); }               
+  __global__ void asin_kernel_v2(Dummy x) { double result = asin(x); }
+  )"};
+
+static constexpr auto kAcos{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void acosf_kernel_v1(float* x) { float result = acosf(x); }             
+  __global__ void acosf_kernel_v2(Dummy x) { float result = acosf(x); }              
+  __global__ void acos_kernel_v1(double* x) { double result = acos(x); }               
+  __global__ void acos_kernel_v2(Dummy x) { double result = acos(x); }
+  )"};
+
+static constexpr auto kAtan{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void atanf_kernel_v1(float* x) { float result = atanf(x); }             
+  __global__ void atanf_kernel_v2(Dummy x) { float result = atanf(x); }              
+  __global__ void atan_kernel_v1(double* x) { double result = atan(x); }               
+  __global__ void atan_kernel_v2(Dummy x) { double result = atan(x); }
+  )"};
+
+static constexpr auto kSinh{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void sinhf_kernel_v1(float* x) { float result = sinhf(x); }             
+  __global__ void sinhf_kernel_v2(Dummy x) { float result = sinhf(x); }              
+  __global__ void sinh_kernel_v1(double* x) { double result = sinh(x); }               
+  __global__ void sinh_kernel_v2(Dummy x) { double result = sinh(x); }
+  )"};
+
+static constexpr auto kCosh{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void coshf_kernel_v1(float* x) { float result = coshf(x); }             
+  __global__ void coshf_kernel_v2(Dummy x) { float result = coshf(x); }              
+  __global__ void cosh_kernel_v1(double* x) { double result = cosh(x); }               
+  __global__ void cosh_kernel_v2(Dummy x) { double result = cosh(x); }
+  )"};
+
+static constexpr auto kTanh{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void tanhf_kernel_v1(float* x) { float result = tanhf(x); }             
+  __global__ void tanhf_kernel_v2(Dummy x) { float result = tanhf(x); }              
+  __global__ void tanh_kernel_v1(double* x) { double result = tanh(x); }               
+  __global__ void tanh_kernel_v2(Dummy x) { double result = tanh(x); }
+  )"};
+
+static constexpr auto kAsinh{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void asinhf_kernel_v1(float* x) { float result = asinhf(x); }             
+  __global__ void asinhf_kernel_v2(Dummy x) { float result = asinhf(x); }              
+  __global__ void asinh_kernel_v1(double* x) { double result = asinh(x); }               
+  __global__ void asinh_kernel_v2(Dummy x) { double result = asinh(x); }
+  )"};
+
+static constexpr auto kAcosh{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void acoshf_kernel_v1(float* x) { float result = acoshf(x); }             
+  __global__ void acoshf_kernel_v2(Dummy x) { float result = acoshf(x); }              
+  __global__ void acosh_kernel_v1(double* x) { double result = acosh(x); }               
+  __global__ void acosh_kernel_v2(Dummy x) { double result = acosh(x); }
+  )"};
+
+static constexpr auto kAtanh{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void atanhf_kernel_v1(float* x) { float result = atanhf(x); }             
+  __global__ void atanhf_kernel_v2(Dummy x) { float result = atanhf(x); }              
+  __global__ void atanh_kernel_v1(double* x) { double result = atanh(x); }               
+  __global__ void atanh_kernel_v2(Dummy x) { double result = atanh(x); }
+  )"};
+
+static constexpr auto kSinpi{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void sinpif_kernel_v1(float* x) { float result = sinpif(x); }             
+  __global__ void sinpif_kernel_v2(Dummy x) { float result = sinpif(x); }              
+  __global__ void sinpi_kernel_v1(double* x) { double result = sinpi(x); }               
+  __global__ void sinpi_kernel_v2(Dummy x) { double result = sinpi(x); }
+  )"};
+
+static constexpr auto kCospi{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void cospif_kernel_v1(float* x) { float result = cospif(x); }             
+  __global__ void cospif_kernel_v2(Dummy x) { float result = cospif(x); }              
+  __global__ void cospi_kernel_v1(double* x) { double result = cospi(x); }               
+  __global__ void cospi_kernel_v2(Dummy x) { double result = cospi(x); }
+  )"};
+
+static constexpr auto kAtan2{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void atan2f_kernel_v1(float* x, float y) { float result = atan2f(x, y); }
+  __global__ void atan2f_kernel_v2(float x, float* y) { float result = atan2f(x, y); }
+  __global__ void atan2f_kernel_v3(Dummy x, float y) { float result = atan2f(x, y); }
+  __global__ void atan2f_kernel_v4(float x, Dummy y) { float result = atan2f(x, y); }
+  __global__ void atan2_kernel_v1(double* x, double y) { double result = atan2(x, y); }
+  __global__ void atan2_kernel_v2(double x, double* y) { double result = atan2(x, y); }
+  __global__ void atan2_kernel_v3(Dummy x, double y) { double result = atan2(x, y); }
+  __global__ void atan2_kernel_v4(double x, Dummy y) { double result = atan2(x, y); }
+  )"};
+
+static constexpr auto kSincos{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void sincosf_kernel_v1(float* x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v2(Dummy x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v3(float x, char* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v4(float x, short* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v5(float x, int* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v6(float x, long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v7(float x, long long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v8(float x, double* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v9(float x, Dummy* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v10(float x, const float* sptr, float* cptr) {
+    sincosf(x, sptr, cptr);
+  }
+  __global__ void sincosf_kernel_v11(float x, float* sptr, char* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v12(float x, float* sptr, short* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v13(float x, float* sptr, int* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v14(float x, float* sptr, long* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v15(float x, float* sptr, long long* cptr) {
+    sincosf(x, sptr, cptr);
+  }
+  __global__ void sincosf_kernel_v16(float x, float* sptr, double* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v17(float x, float* sptr, Dummy* cptr) { sincosf(x, sptr, cptr); }
+  __global__ void sincosf_kernel_v18(float x, float* sptr, const float* cptr) {
+    sincosf(x, sptr, cptr);
+  }
+  __global__ void sincos_kernel_v1(double* x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v2(Dummy x, double* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v3(double x, char* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v4(double x, short* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v5(double x, int* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v6(double x, long* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v7(double x, long long* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v8(double x, float* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v9(double x, Dummy* sptr, double* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v10(double x, const double* sptr, double* cptr) {
+    sincos(x, sptr, cptr);
+  }
+  __global__ void sincos_kernel_v11(double x, double* sptr, char* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v12(double x, double* sptr, short* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v13(double x, double* sptr, int* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v14(double x, double* sptr, long* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v15(double x, double* sptr, long long* cptr) {
+    sincos(x, sptr, cptr);
+  }
+  __global__ void sincos_kernel_v16(double x, double* sptr, float* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v17(double x, double* sptr, Dummy* cptr) { sincos(x, sptr, cptr); }
+  __global__ void sincos_kernel_v18(double x, double* sptr, const double* cptr) {
+    sincos(x, sptr, cptr);
+  }
+    )"};
+
+static constexpr auto kSincospi{R"(
+  class Dummy {                                                                                    
+   public:                                                                                         
+    __device__ Dummy() {}                                                                          
+    __device__ ~Dummy() {}                                                                         
+  };                                                                                               
+  __global__ void sincospif_kernel_v1(float* x, float* sptr, float* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v3(float x, char* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v4(float x, short* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v5(float x, int* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v6(float x, long* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v7(float x, long long* sptr, float* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v8(float x, double* sptr, float* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v10(float x, const float* sptr, float* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v11(float x, float* sptr, char* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v12(float x, float* sptr, short* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v13(float x, float* sptr, int* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v14(float x, float* sptr, long* cptr) { sincospif(x, sptr, cptr); }
+  __global__ void sincospif_kernel_v15(float x, float* sptr, long long* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v16(float x, float* sptr, double* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v17(float x, float* sptr, Dummy* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospif_kernel_v18(float x, float* sptr, const float* cptr) {
+    sincospif(x, sptr, cptr);
+  }
+  __global__ void sincospi_kernel_v1(float* x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v3(float x, char* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v4(float x, short* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v5(float x, int* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v6(float x, long* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v7(float x, long long* sptr, float* cptr) {
+    sincospi(x, sptr, cptr);
+  }
+  __global__ void sincospi_kernel_v8(float x, double* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v10(float x, const float* sptr, float* cptr) {
+    sincospi(x, sptr, cptr);
+  }
+  __global__ void sincospi_kernel_v11(float x, float* sptr, char* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v12(float x, float* sptr, short* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v13(float x, float* sptr, int* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v14(float x, float* sptr, long* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v15(float x, float* sptr, long long* cptr) {
+    sincospi(x, sptr, cptr);
+  }
+  __global__ void sincospi_kernel_v16(float x, float* sptr, double* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v17(float x, float* sptr, Dummy* cptr) { sincospi(x, sptr, cptr); }
+  __global__ void sincospi_kernel_v18(float x, float* sptr, const float* cptr) {
+    sincospi(x, sptr, cptr);
+  }
+  )"};
@@ -0,0 +1,118 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+#define TRIG_SP_UNARY_NEGATIVE_KERNELS(func_name)                                                  \
+  __global__ void func_name##f_kernel_v1(float* x) { float result = func_name##f(x); }             \
+  __global__ void func_name##f_kernel_v2(Dummy x) { float result = func_name##f(x); }
+
+/*Expecting 2 errors per macro invocation - 26 total*/
+TRIG_SP_UNARY_NEGATIVE_KERNELS(sin)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(cos)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(tan)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(asin)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(acos)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(atan)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(sinh)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(cosh)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(tanh)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(asinh)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(atanh)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(sinpi)
+TRIG_SP_UNARY_NEGATIVE_KERNELS(cospi)
+
+/*Expecting 4 errors*/
+__global__ void atan2f_kernel_v1(float* x, float y) { float result = atan2f(x, y); }
+__global__ void atan2f_kernel_v2(float x, float* y) { float result = atan2f(x, y); }
+__global__ void atan2f_kernel_v3(Dummy x, float y) { float result = atan2f(x, y); }
+__global__ void atan2f_kernel_v4(float x, Dummy y) { float result = atan2f(x, y); }
+
+/*Expecting 18 errors*/
+__global__ void sincosf_kernel_v1(float* x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v2(Dummy x, float* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v3(float x, char* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v4(float x, short* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v5(float x, int* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v6(float x, long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v7(float x, long long* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v8(float x, double* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v9(float x, Dummy* sptr, float* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v10(float x, const float* sptr, float* cptr) {
+  sincosf(x, sptr, cptr);
+}
+__global__ void sincosf_kernel_v11(float x, float* sptr, char* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v12(float x, float* sptr, short* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v13(float x, float* sptr, int* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v14(float x, float* sptr, long* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v15(float x, float* sptr, long long* cptr) {
+  sincosf(x, sptr, cptr);
+}
+__global__ void sincosf_kernel_v16(float x, float* sptr, double* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v17(float x, float* sptr, Dummy* cptr) { sincosf(x, sptr, cptr); }
+__global__ void sincosf_kernel_v18(float x, float* sptr, const float* cptr) {
+  sincosf(x, sptr, cptr);
+}
+
+/*Expecting 18 errors*/
+__global__ void sincospif_kernel_v1(float* x, float* sptr, float* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v2(Dummy x, float* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v3(float x, char* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v4(float x, short* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v5(float x, int* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v6(float x, long* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v7(float x, long long* sptr, float* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v8(float x, double* sptr, float* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v9(float x, Dummy* sptr, float* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v10(float x, const float* sptr, float* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v11(float x, float* sptr, char* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v12(float x, float* sptr, short* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v13(float x, float* sptr, int* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v14(float x, float* sptr, long* cptr) { sincospif(x, sptr, cptr); }
+__global__ void sincospif_kernel_v15(float x, float* sptr, long long* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v16(float x, float* sptr, double* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v17(float x, float* sptr, Dummy* cptr) {
+  sincospif(x, sptr, cptr);
+}
+__global__ void sincospif_kernel_v18(float x, float* sptr, const float* cptr) {
+  sincospif(x, sptr, cptr);
+}
@@ -0,0 +1,198 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "math_common.hh"
+#include "math_special_values.hh"
+
+#include <hip/hip_cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+#define MATH_UNARY_KERNEL_DEF(func_name)                                                           \
+  template <typename T, typename RT = T>                                                           \
+  __global__ void func_name##_kernel(RT* const ys, const size_t num_xs, T* const xs) {             \
+    const auto tid = cg::this_grid().thread_rank();                                                \
+    const auto stride = cg::this_grid().size();                                                    \
+                                                                                                   \
+    for (auto i = tid; i < num_xs; i += stride) {                                                  \
+      if constexpr (std::is_same_v<float, T>) {                                                    \
+        ys[i] = func_name##f(xs[i]);                                                               \
+      } else if constexpr (std::is_same_v<double, T>) {                                            \
+        ys[i] = func_name(xs[i]);                                                                  \
+      }                                                                                            \
+    }                                                                                              \
+  }
+
+template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
+void UnarySinglePrecisionBruteForceTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref_func,
+                                        const ValidatorBuilder& validator_builder) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  uint64_t stop = std::numeric_limits<uint32_t>::max() + 1ul;
+  const auto max_batch_size =
+      std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T)), stop);
+  LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
+
+  MathTest math_test(kernel, max_batch_size);
+
+  auto batch_size = max_batch_size;
+  const auto num_threads = thread_pool.thread_count();
+
+  for (uint64_t v = 0u; v < stop;) {
+    batch_size = std::min<uint64_t>(max_batch_size, stop - v);
+
+    const auto min_sub_batch_size = batch_size / num_threads;
+    const auto tail = batch_size % num_threads;
+
+    auto base_idx = 0u;
+    for (auto i = 0u; i < num_threads; ++i) {
+      const auto sub_batch_size = min_sub_batch_size + (i < tail);
+
+      thread_pool.Post([=, &values] {
+        auto t = v;
+        uint32_t val;
+        for (auto j = 0u; j < sub_batch_size; ++j) {
+          val = static_cast<uint32_t>(t++);
+          values.ptr()[base_idx + j] = *reinterpret_cast<float*>(&val);
+        }
+      });
+
+      v += sub_batch_size;
+      base_idx += sub_batch_size;
+    }
+
+    thread_pool.Wait();
+
+    math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, values.ptr());
+  }
+}
+
+template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
+void UnarySinglePrecisionRangeTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref_func,
+                                   const ValidatorBuilder& validator_builder, const float a,
+                                   const float b) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  uint64_t stop = std::numeric_limits<uint32_t>::max() + 1ul;
+  const auto max_batch_size = GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T));
+  LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
+
+  MathTest math_test(kernel, max_batch_size);
+
+  uint32_t val = 0u;
+  const auto num_threads = thread_pool.thread_count();
+
+  size_t inserted = 0u;
+  for (float v = a; v != b; v = std::nextafter(v, b)) {
+    values.ptr()[inserted++] = v;
+    if (inserted < max_batch_size) continue;
+
+    math_test.Run(validator_builder, grid_size, block_size, ref_func, inserted, values.ptr());
+    inserted = 0u;
+  }
+}
+
+template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
+void UnaryDoublePrecisionBruteForceTest(kernel_sig<T, double> kernel, ref_sig<RT, RTArg> ref_func,
+                                        const ValidatorBuilder& validator_builder,
+                                        const double a = std::numeric_limits<double>::lowest(),
+                                        const double b = std::numeric_limits<double>::max()) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const uint64_t num_iterations = GetTestIterationCount();
+  const auto max_batch_size =
+      std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(double) + sizeof(T)), num_iterations);
+  LinearAllocGuard<double> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(double)};
+
+  MathTest math_test(kernel, max_batch_size);
+
+  auto batch_size = max_batch_size;
+  const auto num_threads = thread_pool.thread_count();
+  for (uint64_t i = 0ul; i < num_iterations; i += batch_size) {
+    batch_size = std::min<uint64_t>(max_batch_size, num_iterations - i);
+
+    const auto min_sub_batch_size = batch_size / num_threads;
+    const auto tail = batch_size % num_threads;
+
+    auto base_idx = 0u;
+    for (auto i = 0u; i < num_threads; ++i) {
+      const auto sub_batch_size = min_sub_batch_size + (i < tail);
+      thread_pool.Post([=, &values] {
+        const auto generator = [=] {
+          static thread_local std::mt19937 rng(std::random_device{}());
+          std::uniform_real_distribution<long double> unif_dist(a, b);
+          return static_cast<double>(unif_dist(rng));
+        };
+        std::generate(values.ptr() + base_idx, values.ptr() + base_idx + sub_batch_size, generator);
+      });
+      base_idx += sub_batch_size;
+    }
+
+    thread_pool.Wait();
+
+    math_test.Run(validator_builder, grid_size, block_size, ref_func, batch_size, values.ptr());
+  }
+}
+
+template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
+void UnaryDoublePrecisionSpecialValuesTest(kernel_sig<T, double> kernel,
+                                           ref_sig<RT, RTArg> ref_func,
+                                           const ValidatorBuilder& validator_builder) {
+  const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
+  const auto values = std::get<SpecialVals<double>>(kSpecialValRegistry);
+
+  MathTest math_test(kernel, values.size);
+  math_test.template Run<false>(validator_builder, grid_size, block_size, ref_func, values.size,
+                                values.data);
+}
+
+template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
+void UnarySinglePrecisionTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref,
+                              const ValidatorBuilder& validator_builder) {
+  SECTION("Brute force") { UnarySinglePrecisionBruteForceTest(kernel, ref, validator_builder); }
+}
+
+template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
+void UnaryDoublePrecisionTest(kernel_sig<T, double> kernel, ref_sig<RT, RTArg> ref,
+                              const ValidatorBuilder& validator_builder) {
+  SECTION("Special values") {
+    UnaryDoublePrecisionSpecialValuesTest(kernel, ref, validator_builder);
+  }
+
+  SECTION("Brute force") { UnaryDoublePrecisionBruteForceTest(kernel, ref, validator_builder); }
+}
+
+#define MATH_UNARY_WITHIN_ULP_TEST_DEF(kern_name, ref_func, sp_ulp, dp_ulp)                        \
+  MATH_UNARY_KERNEL_DEF(kern_name)                                                                 \
+                                                                                                   \
+  TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive - float") {                              \
+    double (*ref)(double) = ref_func;                                                              \
+    UnarySinglePrecisionTest(kern_name##_kernel<float>, ref,                                       \
+                             ULPValidatorBuilderFactory<float>(sp_ulp));                           \
+  }                                                                                                \
+                                                                                                   \
+  TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive - double") {                             \
+    long double (*ref)(long double) = ref_func;                                                    \
+    UnaryDoublePrecisionTest(kern_name##_kernel<double>, ref,                                      \
+                             ULPValidatorBuilderFactory<double>(dp_ulp));                          \
+  }
+
+#define MATH_UNARY_WITHIN_ULP_STL_REF_TEST_DEF(func_name, sp_ulp, dp_ulp)                          \
+  MATH_UNARY_WITHIN_ULP_TEST_DEF(func_name, std::func_name, sp_ulp, dp_ulp)
@@ -0,0 +1,152 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <catch.hpp>
+
+// Define a new MatcherBase class with a public 'describe' member function because
+// Catch::MatcherBase::describe is protected and thus can't be used via a pointer to
+// Catch::MatcherBase.
+template <typename T> class MatcherBase : public Catch::MatcherBase<T> {
+ public:
+  virtual std::string describe() const = 0;
+  virtual ~MatcherBase() = default;
+};
+
+template <typename T, typename Matcher> class ValidatorBase : public MatcherBase<T> {
+ public:
+  template <typename... Ts>
+  ValidatorBase(T target, Ts&&... args) : matcher_{std::forward<Ts>(args)...}, target_{target} {}
+
+  bool match(const T& val) const override {
+    if (std::isnan(target_)) {
+      return std::isnan(val);
+    }
+
+    return matcher_.match(val);
+  }
+
+  std::string describe() const override {
+    if (std::isnan(target_)) {
+      return "is not NaN";
+    }
+
+    return matcher_.describe();
+  }
+
+ private:
+  Matcher matcher_;
+  T target_;
+  bool nan = false;
+};
+
+template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
+  return [=](T target, auto&&... args) {
+    return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinUlpsMatcher>>(
+        target, Catch::WithinULP(target, ulps));
+  };
+};
+
+template <typename T> auto AbsValidatorBuilderFactory(double margin) {
+  return [=](T target, auto&&... args) {
+    return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinAbsMatcher>>(
+        target, Catch::WithinAbs(target, margin));
+  };
+}
+
+template <typename T> auto RelValidatorBuilderFactory(T margin) {
+  return [=](T target, auto&&... args) {
+    return std::make_unique<ValidatorBase<T, Catch::Matchers::Floating::WithinRelMatcher>>(
+        target, Catch::WithinRel(target, margin));
+  };
+}
+
+template <typename T> class EqValidator : public MatcherBase<T> {
+ public:
+  EqValidator(T target) : target_{target} {}
+
+  bool match(const T& val) const override {
+    if (std::isnan(target_)) {
+      return std::isnan(val);
+    }
+
+    return target_ == val;
+  }
+
+  std::string describe() const override {
+    std::stringstream ss;
+    ss << " is not equal to " << target_;
+    return ss.str();
+  }
+
+ private:
+  T target_;
+};
+
+template <typename T> auto EqValidatorBuilderFactory() {
+  return [](T val, auto&&... args) { return std::make_unique<EqValidator<T>>(val); };
+}
+
+template <typename T, typename U, typename VBF, typename VBS>
+class PairValidator : public MatcherBase<std::pair<T, U>> {
+ public:
+  PairValidator(const std::pair<T, U>& target, const VBF& vbf, const VBS& vbs)
+      : first_matcher_{vbf(target.first)}, second_matcher_{vbs(target.second)} {}
+
+  bool match(const std::pair<T, U>& val) const override {
+    return first_matcher_->match(val.first) && second_matcher_->match(val.second);
+  }
+
+  std::string describe() const override {
+    return "<" + first_matcher_->describe() + ", " + second_matcher_->describe() + ">";
+  }
+
+ private:
+  decltype(std::declval<VBF>()(std::declval<T>())) first_matcher_;
+  decltype(std::declval<VBS>()(std::declval<U>())) second_matcher_;
+};
+
+template <typename T, typename ValidatorBuilder>
+auto PairValidatorBuilderFactory(const ValidatorBuilder& vb) {
+  return [=](const std::pair<T, T>& t, auto&&... args) {
+    return std::make_unique<PairValidator<T, T, ValidatorBuilder, ValidatorBuilder>>(t, vb, vb);
+  };
+}
+
+template <typename T, typename U, typename VBF, typename VBS>
+auto PairValidatorBuilderFactory(const VBF& vbf, const VBS& vbs) {
+  return [=](const std::pair<T, U>& t, auto&&... args) {
+    return std::make_unique<PairValidator<T, U, VBF, VBS>>(t, vbf, vbs);
+  };
+}
+
+template <typename T> class NopValidator : public MatcherBase<T> {
+ public:
+  bool match(const T& val) const override { return true; }
+
+  std::string describe() const override { return ""; }
+};
+
+template <typename T> auto NopValidatorBuilderFactory() {
+  return [](auto&&... args) { return std::make_unique<NopValidator<T>>(); };
+}