/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #pragma once #include #include #include #include #include "Float16.hh" #include "thread_pool.hh" #include "validators.hh" namespace cg = cooperative_groups; template std::enable_if_t, std::is_arithmetic>, std::ostream&> operator<<(std::ostream& os, const std::pair& p) { const auto default_prec = os.precision(); return os << "<" << std::setprecision(std::numeric_limits::max_digits10 - 1) << p.first << ", " << std::setprecision(std::numeric_limits::max_digits10 - 1) << p.second << ">" << std::setprecision(default_prec); } template std::enable_if_t, std::ostream&> operator<<(std::ostream& os, const T& p) { const auto default_prec = os.precision(); return os << "<" << std::setprecision(std::numeric_limits::max_digits10 - 1) << p.x << ", " << std::setprecision(std::numeric_limits::max_digits10 - 1) << p.y << ">" << std::setprecision(default_prec); } // This class represents a generic numerical accuracy math test. Template parameter T is the output // type of the function being tested, and template parameter pack Ts represents the input types. The // constructor takes a kernel with the signature void(T*, const size_t, Ts*...). The first kernel // parameter is the output array, the second parameter is the number of outputs, and the rest of the // parameters are arrays containing input values. The number of input arrays depends on the arity of // the function being tested e.g. one input array for unary functions, two input arrays for binary // functions, etc. The kernel threads take one element from each input array at the index // corresponding to that thread, feed the input elements to the testee function, and store the // result in the output array at the corresponding index. // // E.g. for a binary function the kernel would have the following signature: // void kernel(float* y, const size_t n, float* x1, float* x2) // // The outputs would be calculated in parallel the following way: // y[0] = testee(x1[0], x2[0]) // y[1] = testee(x1[1], x2[1]) // y[2] = testee(x1[2], x2[2]) // ... // // The constructor also takes max_num_args, which represents the maximum number of input values used // for one kernel launch. The device memory for the input and output arrays is allocated based on // that number. template class MathTest { public: MathTest(void (*kernel)(T*, const size_t, Ts*...), const size_t max_num_args) : kernel_{kernel}, xss_dev_(LinearAllocGuard(LinearAllocs::hipMalloc, max_num_args * sizeof(Ts))...), y_dev_{LinearAllocs::hipMalloc, max_num_args * sizeof(T)}, y_{LinearAllocs::hipHostMalloc, max_num_args * sizeof(T)} {} // This method runs the test with the following steps: // 1. Copy the values from the input arrays provided in the parameter pack xss to device memory // 2. Launch the kernel using the configuration provided in grid_dims and block_dims // 3. Copy the outputs back to host memory // 4. Generate the reference values using ref_func and compare against the outputs using the // validator provided by validator_builder // 5. If non-type template parameter parallel is true, then step 4 is broken up into chunks of // work that are done in parallel on the host. template void Run(const ValidatorBuilder& validator_builder, const size_t grid_dims, const size_t block_dims, RT (*const ref_func)(RTs...), const size_t num_args, const Ts*... xss) { fail_flag_.store(false); error_info_.clear(); RunImpl(validator_builder, grid_dims, block_dims, ref_func, num_args, std::index_sequence_for{}, xss...); } private: void (*kernel_)(T*, const size_t, Ts*...); std::tuple...> xss_dev_; LinearAllocGuard y_dev_; LinearAllocGuard y_; std::atomic fail_flag_{false}; std::mutex mtx_; std::string error_info_; template void RunImpl(const ValidatorBuilder& validator_builder, const size_t grid_dim, const size_t block_dim, RT (*const ref_func)(RTs...), const size_t num_args, std::index_sequence, const Ts*... xss) { const auto xss_tup = std::make_tuple(xss...); constexpr auto f = [](auto dst, auto src, size_t size) { HIP_CHECK(hipMemcpy(dst, src, size, hipMemcpyHostToDevice)) }; ((f(std::get(xss_dev_).ptr(), std::get(xss_tup), num_args * sizeof(*std::get(xss_tup)))), ...); kernel_<<>>(y_dev_.ptr(), num_args, std::get(xss_dev_).ptr()...); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(y_.ptr(), y_dev_.ptr(), num_args * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipStreamSynchronize(nullptr)); if constexpr (!parallel) { for (auto i = 0u; i < num_args; ++i) { const auto actual_val = y_.ptr()[i]; const auto ref_val = static_cast(ref_func(xss[i]...)); const auto validator = validator_builder(ref_val, xss[i]...); if (!validator->match(actual_val)) { const auto log = MakeLogMessage(actual_val, xss[i]...) + validator->describe() + "\n"; INFO(log); REQUIRE(false); } } return; } const auto task = [&, this](size_t iters, size_t base_idx) { for (auto i = 0u; i < iters; ++i) { if (fail_flag_.load(std::memory_order_relaxed)) return; const auto actual_val = y_.ptr()[base_idx + i]; const auto ref_val = static_cast(ref_func(xss[base_idx + i]...)); const auto validator = validator_builder(ref_val, xss[base_idx + i]...); if (!validator->match(actual_val)) { fail_flag_.store(true, std::memory_order_relaxed); // Several threads might have passed the first check, but failed validation. On the // chance of this happening, access to the string stream must be serialized. const auto log = MakeLogMessage(actual_val, xss[base_idx + i]...) + validator->describe() + "\n"; { std::lock_guard lg{mtx_}; error_info_ += log; } return; } } }; const auto task_count = thread_pool.thread_count(); const auto chunk_size = num_args / task_count; const auto tail = num_args % task_count; auto base_idx = 0u; for (auto i = 0u; i < task_count; ++i) { const auto iters = chunk_size + (i < tail); thread_pool.Post([=, &task] { task(iters, base_idx); }); base_idx += iters; } thread_pool.Wait(); INFO(error_info_); REQUIRE(!fail_flag_); } template std::string MakeLogMessage(T actual_val, Args... args) { std::stringstream ss; ss << "Input value(s): " << std::scientific << std::setprecision(std::numeric_limits::max_digits10 - 1); ((ss << " " << args), ...) << "\n" << "Output value: " << actual_val << "\n" << "Condition failed: "; return ss.str(); } }; template struct RefType {}; template <> struct RefType { using type = float; }; template <> struct RefType { using type = double; }; template <> struct RefType { using type = long double; }; template using RefType_t = typename RefType::type; template auto GetOccupancyMaxPotentialBlockSize(F kernel) { int grid_size = 0, block_size = 0; HIP_CHECK(hipOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0, 0)); return std::make_tuple(grid_size, block_size); } inline size_t GetMaxAllowedDeviceMemoryUsage() { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); return props.totalGlobalMem > cmd_options.max_memory ? cmd_options.max_memory : props.totalGlobalMem * (cmd_options.accuracy_max_memory * 0.01f); } inline double GetTestReductionFactor() { return cmd_options.reduction_factor * 0.01; } inline uint64_t GetTestIterationCount() { return static_cast( std::ceil(cmd_options.accuracy_iterations * GetTestReductionFactor())); } template using kernel_sig = void (*)(T*, const size_t, Ts*...); template using ref_sig = T (*)(Ts...); template void NegativeTestRTCWrapper(const char* program_source) { hiprtcProgram program{}; HIPRTC_CHECK( hiprtcCreateProgram(&program, program_source, "math_test_rtc.cc", 0, nullptr, nullptr)); #if HT_AMD std::string args = std::string("-ferror-limit=200"); const char* options[] = {args.c_str()}; hiprtcResult result{hiprtcCompileProgram(program, 1, options)}; #else hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)}; #endif // Get the compile log and count compiler error messages size_t log_size{}; HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size)); std::string log(log_size, ' '); HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data())); int error_count{0}; int expected_error_count{error_num}; std::string error_message{"error:"}; size_t n_pos = log.find(error_message, 0); while (n_pos != std::string::npos) { ++error_count; n_pos = log.find(error_message, n_pos + 1); } HIPRTC_CHECK(hiprtcDestroyProgram(&program)); HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION); REQUIRE(error_count == expected_error_count); } inline void SinglePrecisionReducedRun(std::function run, const LinearAllocGuard& values, const float a, const float b, const double reduction_factor, const size_t max_batch_size) { bool test_positive = true; float positive_start = 0.0f; float positive_end = 0.0f; bool test_negative = true; float negative_start = 0.0f; float negative_end = 0.0f; if (a < 0 && b <= 0) { test_positive = false; negative_start = -b; negative_end = -a; } if (a < 0 && b > 0) { positive_start = 0.0f; positive_end = b; negative_start = 0.0f; negative_end = -a; } if (a >= 0 && b > 0) { positive_start = a; positive_end = b; test_negative = false; } const auto inv_reduction_factor = 1 / reduction_factor; size_t inserted = 0u; constexpr int radix = std::numeric_limits::radix; float increment = std::numeric_limits::min() * std::numeric_limits::epsilon(); float limit = std::numeric_limits::min() * radix; const auto iterate = [&](float start, float end, int positive) { for (float v = start; v < end; limit *= radix, increment *= radix) { const auto start_v = v; double count = 0ul; while (v < limit && v < end) { values.ptr()[inserted++] = (v * positive); count += inv_reduction_factor; v = start_v + increment * static_cast(std::floor(count)); if (inserted < max_batch_size) continue; run(inserted); inserted = 0u; } } if (inserted > 0u) { run(inserted); inserted = 0u; } }; if (test_positive) { iterate(positive_start, positive_end, 1); } increment = std::numeric_limits::min() * std::numeric_limits::epsilon(); limit = std::numeric_limits::min() * radix; if (test_negative) { iterate(negative_start, negative_end, -1); } }