diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 1c8f640afb..e405d06ed6 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -167,6 +167,7 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") set(SOURCE_FILES_RUNTIME + src/code_object_bundle.cpp src/hip_hcc.cpp src/hip_context.cpp src/hip_device.cpp @@ -179,7 +180,8 @@ if(HIP_PLATFORM STREQUAL "hcc") src/hip_db.cpp src/grid_launch.cpp src/hip_texture.cpp - src/env.cpp) + src/env.cpp + src/program_state.cpp) set(SOURCE_FILES_DEVICE src/device_util.cpp diff --git a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp new file mode 100644 index 0000000000..72f9d35c73 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp @@ -0,0 +1,158 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl +{ + hsa_isa_t triple_to_hsa_isa(const std::string& triple); + + struct Bundled_code { + union { + struct { + std::uint64_t offset; + std::uint64_t bundle_sz; + std::uint64_t triple_sz; + }; + std::uint8_t cbuf[ + sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; + }; + std::string triple; + std::vector blob; + }; + + class Bundled_code_header { + // DATA - STATICS + static constexpr const char magic_string_[] = + "__CLANG_OFFLOAD_BUNDLE__"; + static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1; + + // DATA + union { + struct { + std::uint8_t bundler_magic_string_[magic_string_sz_]; + std::uint64_t bundle_cnt_; + }; + std::uint8_t cbuf_[ + sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)]; + }; + std::vector bundles_; + + // FRIENDS - MANIPULATORS + template + friend + inline + bool read( + RandomAccessIterator f, + RandomAccessIterator l, + Bundled_code_header& x) + { + if (f == l) return false; + + std::copy_n(f, sizeof(x.cbuf_), x.cbuf_); + + if (valid(x)) { + x.bundles_.resize(x.bundle_cnt_); + + auto it = f + sizeof(x.cbuf_); + for (auto&& y : x.bundles_) { + std::copy_n(it, sizeof(y.cbuf), y.cbuf); + it += sizeof(y.cbuf); + + y.triple.insert(y.triple.cend(), it, it + y.triple_sz); + + std::copy_n( + f + y.offset, y.bundle_sz, std::back_inserter(y.blob)); + + it += y.triple_sz; + } + + return true; + } + + return false; + } + friend + inline + bool read(const std::vector& blob, Bundled_code_header& x) + { + return read(blob.cbegin(), blob.cend(), x); + } + friend + inline + bool read(std::istream& is, Bundled_code_header& x) + { + return read(std::vector{ + std::istreambuf_iterator{is}, + std::istreambuf_iterator{}}, + x); + } + + // FRIENDS - ACCESSORS + friend + inline + bool valid(const Bundled_code_header& x) + { + return std::equal( + x.bundler_magic_string_, + x.bundler_magic_string_ + magic_string_sz_, + x.magic_string_); + } + friend + inline + const std::vector& bundles(const Bundled_code_header& x) + { + return x.bundles_; + } + public: + // CREATORS + Bundled_code_header() = default; + template + Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l); + explicit + Bundled_code_header(const std::vector& blob); + Bundled_code_header(const Bundled_code_header&) = default; + Bundled_code_header(Bundled_code_header&&) = default; + ~Bundled_code_header() = default; + + // MANIPULATORS + Bundled_code_header& operator=(const Bundled_code_header&) = default; + Bundled_code_header& operator=(Bundled_code_header&&) = default; + }; + + // CREATORS + template + Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{} + { + read(f, l, *this); + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/hipamd/include/hip/hcc_detail/concepts.hpp b/hipamd/include/hip/hcc_detail/concepts.hpp index 5c50f5d577..18c1119b73 100644 --- a/hipamd/include/hip/hcc_detail/concepts.hpp +++ b/hipamd/include/hip/hcc_detail/concepts.hpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp b/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp new file mode 100644 index 0000000000..bbffae52e8 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp @@ -0,0 +1,159 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "code_object_bundle.hpp" +#include "concepts.hpp" +#include "helpers.hpp" +#include "program_state.hpp" + +#include "hc.hpp" +#include "hip/hip_hcc.h" +#include "hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl +{ + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + T round_up_to_next_multiple_nonnegative(T x, T y) + { + T tmp = x + y - 1; + return tmp - tmp % y; + } + + inline + std::vector make_kernarg() + { + return {}; + } + + inline + std::vector make_kernarg(std::vector kernarg) + { + return kernarg; + } + + template + inline + std::vector make_kernarg(std::vector kernarg, T x) + { + kernarg.resize( + round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + + sizeof(T)); + + new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; + + return kernarg; + } + + template + inline + std::vector make_kernarg( + std::vector kernarg, T x, Ts... xs) + { + return make_kernarg( + make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); + } + + template + inline + std::vector make_kernarg(Ts... xs) + { + std::vector kernarg; + kernarg.reserve(sizeof(std::tuple)); + + return make_kernarg(std::move(kernarg), std::move(xs)...); + } + + void hipLaunchKernelGGLImpl( + std::uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + void** kernarg); +} // Namespace hip_impl. + +template +inline +void hipLaunchKernelGGL( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + Args... args) +{ + auto kernarg = hip_impl::make_kernarg(std::move(args)...); + std::size_t kernarg_size = kernarg.size(); + + void* config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), + HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, + HIP_LAUNCH_PARAM_END + }; + + hip_impl::hipLaunchKernelGGLImpl( + reinterpret_cast(kernel), + numBlocks, + dimBlocks, + sharedMemBytes, + stream, + &config[0]); +} + +template +inline +void hipLaunchKernel( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t groupMemBytes, + hipStream_t stream, + Args... args) +{ + hipLaunchKernelGGL( + kernel, + numBlocks, + dimBlocks, + groupMemBytes, + stream, + hipLaunchParm{}, + std::move(args)...); +} + diff --git a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 4c632f9d68..95903436b6 100644 --- a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -19,989 +19,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - #pragma once #if GENERIC_GRID_LAUNCH == 1 - -#include "concepts.hpp" -#include "helpers.hpp" - -#include "hc.hpp" -#include "hip/hip_hcc.h" -#include "hip_runtime.h" - -#include -#include -#include -#include -#include - -namespace hip_impl -{ - namespace - { - struct New_grid_launch_tag {}; - struct Old_grid_launch_tag {}; - - template - class RAII_guard { - D dtor_; - public: - RAII_guard() = default; - - RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} - { - ctor(); - } - - RAII_guard(const RAII_guard&) = default; - RAII_guard(RAII_guard&&) = default; - - RAII_guard& operator=(const RAII_guard&) = default; - RAII_guard& operator=(RAII_guard&&) = default; - - ~RAII_guard() { dtor_(); } - }; - - template - RAII_guard make_RAII_guard(const C& ctor, D dtor) - { - return RAII_guard{ctor, std::move(dtor)}; - } - - template - using is_new_grid_launch_t = typename std::conditional< - is_callable{}, - New_grid_launch_tag, - Old_grid_launch_tag>::type; - } - - // TODO: - dispatch rank should be derived from the domain dimensions passed - // in, and not always assumed to be 3; - - template - requires(Domain == {Ts...}) - inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - const hc::accelerator_view& acc_v, - K k) - { - const auto d = hc::extent<3>{ - num_blocks.z * dim_blocks.z, - num_blocks.y * dim_blocks.y, - num_blocks.x * dim_blocks.x}.tile_with_dynamic( - dim_blocks.z, - dim_blocks.y, - dim_blocks.x, - group_mem_bytes); - - try { - hc::parallel_for_each(acc_v, d, k); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } - } - - // TODO: these are workarounds, they should be removed. - - hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); - void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); - void unlock_stream_hip_( - hipStream_t, void*, const char*, hc::accelerator_view*); - - template - requires(Domain == {Ts...}) - inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - void* lck_stream = nullptr; - auto acc_v = lock_stream_hip_(stream, lck_stream); - auto stream_guard = make_RAII_guard( - std::bind( - print_prelaunch_trace_, - kernel_name, - num_blocks, - dim_blocks, - group_mem_bytes, - stream), - std::bind( - unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); - - try { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - acc_v, - std::move(k)); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } - } - - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } - - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } - - // TODO: these are temporary and purposefully noisy and disruptive. - #define make_kernel_name_hip(k, n)\ - HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ - HIP_kernel_functor_name_end ## _ ## n - - #define make_kernel_functor_hip_30(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26, p27)\ - struct make_kernel_name_hip(function_name, 28) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - std::decay_t _p27_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_, _p27_);\ - }\ - } - #define make_kernel_functor_hip_29(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26)\ - struct make_kernel_name_hip(function_name, 27) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_);\ - }\ - } - #define make_kernel_functor_hip_28(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25)\ - struct make_kernel_name_hip(function_name, 26) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ - }\ - } - #define make_kernel_functor_hip_27(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24)\ - struct make_kernel_name_hip(function_name, 25) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - }\ - } - #define make_kernel_functor_hip_26(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ - struct make_kernel_name_hip(function_name, 24) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - }\ - } - #define make_kernel_functor_hip_25(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ - struct make_kernel_name_hip(function_name, 23) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - __attribute__((used, flatten))\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - }\ - } - #define make_kernel_functor_hip_24(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ - struct make_kernel_name_hip(function_name, 22) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - }\ - } - #define make_kernel_functor_hip_23(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ - struct make_kernel_name_hip(function_name, 21) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - }\ - } - #define make_kernel_functor_hip_22(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ - struct make_kernel_name_hip(function_name, 20) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - }\ - } - #define make_kernel_functor_hip_21(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18)\ - struct make_kernel_name_hip(function_name, 19) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - }\ - } - #define make_kernel_functor_hip_20(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17)\ - struct make_kernel_name_hip(function_name, 18) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - }\ - } - #define make_kernel_functor_hip_19(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16)\ - struct make_kernel_name_hip(function_name, 17) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - }\ - } - #define make_kernel_functor_hip_18(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15)\ - struct make_kernel_name_hip(function_name, 16) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - }\ - } - #define make_kernel_functor_hip_17(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14)\ - struct make_kernel_name_hip(function_name, 15) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_);\ - }\ - } - #define make_kernel_functor_hip_16(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13)\ - struct make_kernel_name_hip(function_name, 14) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_);\ - }\ - } - #define make_kernel_functor_hip_15(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12)\ - struct make_kernel_name_hip(function_name, 13) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_);\ - }\ - } - #define make_kernel_functor_hip_14(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11)\ - struct make_kernel_name_hip(function_name, 12) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_);\ - }\ - } - #define make_kernel_functor_hip_13(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - struct make_kernel_name_hip(function_name, 11) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - }\ - } - #define make_kernel_functor_hip_12(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - struct make_kernel_name_hip(function_name, 10) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_);\ - }\ - } - #define make_kernel_functor_hip_11(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - struct make_kernel_name_hip(function_name, 9) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ - }\ - } - #define make_kernel_functor_hip_10(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - struct make_kernel_name_hip(function_name, 8) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - }\ - } - #define make_kernel_functor_hip_9(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - struct make_kernel_name_hip(function_name, 7) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - }\ - } - #define make_kernel_functor_hip_8(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ - struct make_kernel_name_hip(function_name, 6) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - }\ - } - #define make_kernel_functor_hip_7(\ - function_name, kernel_name, p0, p1, p2, p3, p4)\ - struct make_kernel_name_hip(function_name, 5) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - }\ - } - #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ - struct make_kernel_name_hip(function_name, 4) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - }\ - } - #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ - struct make_kernel_name_hip(function_name, 3) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_);\ - }\ - } - #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ - struct make_kernel_name_hip(function_name, 2) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_);\ - }\ - } - #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n - #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ - struct make_kernel_name_hip(function_name, 1) {\ - std::decay_t _p0_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_);\ - }\ - } - #define make_kernel_functor_hip_2(function_name, kernel_name)\ - struct make_kernel_name_hip(function_name, 0) {\ - void operator()(const hc::tiled_index<3>&) [[hc]]\ - {\ - return kernel_name(hipLaunchParm{});\ - }\ - } - #define make_kernel_functor_hip_1(...) - #define make_kernel_functor_hip_0(...) - #define make_kernel_functor_hip_(...)\ - overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - - - #define hipLaunchNamedKernelGGL(\ - function_name,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ - hip_kernel_functor_impl_{__VA_ARGS__};\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - hip_kernel_functor_impl_);\ - } while(0) - - #define hipLaunchKernelGGL(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchNamedKernelGGL(\ - unnamed,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ##__VA_ARGS__);\ - } while (0) - - #define hipLaunchKernel(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) -} -#endif //GENERIC_GRID_LAUNCH + #if __hcc_workweek__ >= 17481 + #include "functional_grid_launch.hpp" + #else + #include "macro_based_grid_launch.hpp" + #endif +#endif //GENERIC_GRID_LAUNCH \ No newline at end of file diff --git a/hipamd/include/hip/hcc_detail/host_defines.h b/hipamd/include/hip/hcc_detail/host_defines.h index b2e7ac2617..a7acdfccf7 100644 --- a/hipamd/include/hip/hcc_detail/host_defines.h +++ b/hipamd/include/hip/hcc_detail/host_defines.h @@ -44,7 +44,12 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak)) + #if __hcc_workweek__ >= 17481 + #define __global__ \ + __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) + #else + #define __global__ __attribute__((hc, used)) + #endif #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) diff --git a/hipamd/include/hip/hcc_detail/macro_based_grid_launch.hpp b/hipamd/include/hip/hcc_detail/macro_based_grid_launch.hpp new file mode 100644 index 0000000000..f1dfe76245 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/macro_based_grid_launch.hpp @@ -0,0 +1,1004 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "concepts.hpp" +#include "helpers.hpp" + +#include "hc.hpp" +#include "hip/hip_hcc.h" +#include "hip_runtime.h" + +#include +#include +#include +#include +#include + +namespace hip_impl +{ + namespace + { + struct New_grid_launch_tag {}; + struct Old_grid_launch_tag {}; + + template + class RAII_guard { + D dtor_; + public: + RAII_guard() = default; + + RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} + { + ctor(); + } + + RAII_guard(const RAII_guard&) = default; + RAII_guard(RAII_guard&&) = default; + + RAII_guard& operator=(const RAII_guard&) = default; + RAII_guard& operator=(RAII_guard&&) = default; + + ~RAII_guard() { dtor_(); } + }; + + template + RAII_guard make_RAII_guard(const C& ctor, D dtor) + { + return RAII_guard{ctor, std::move(dtor)}; + } + + template + using is_new_grid_launch_t = typename std::conditional< + is_callable{}, + New_grid_launch_tag, + Old_grid_launch_tag>::type; + } + + // TODO: - dispatch rank should be derived from the domain dimensions passed + // in, and not always assumed to be 3; + + template + requires(Domain == {Ts...}) + inline + void grid_launch_hip_impl_( + New_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + const hc::accelerator_view& acc_v, + K k) + { + const auto d = hc::extent<3>{ + num_blocks.z * dim_blocks.z, + num_blocks.y * dim_blocks.y, + num_blocks.x * dim_blocks.x}.tile_with_dynamic( + dim_blocks.z, + dim_blocks.y, + dim_blocks.x, + group_mem_bytes); + + try { + hc::parallel_for_each(acc_v, d, k); + } + catch (std::exception& ex) { + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; + throw; + } + } + + // TODO: these are workarounds, they should be removed. + + hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); + void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); + void unlock_stream_hip_( + hipStream_t, void*, const char*, hc::accelerator_view*); + + template + requires(Domain == {Ts...}) + inline + void grid_launch_hip_impl_( + New_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + void* lck_stream = nullptr; + auto acc_v = lock_stream_hip_(stream, lck_stream); + auto stream_guard = make_RAII_guard( + std::bind( + print_prelaunch_trace_, + kernel_name, + num_blocks, + dim_blocks, + group_mem_bytes, + stream), + std::bind( + unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); + + try { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + acc_v, + std::move(k)); + } + catch (std::exception& ex) { + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; + throw; + } + } + + template + requires(Domain == {hipLaunchParm, Ts...}) + inline + void grid_launch_hip_impl_( + Old_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + K k) + { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + std::move(k)); + } + + template + requires(Domain == {hipLaunchParm, Ts...}) + inline + void grid_launch_hip_impl_( + Old_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + kernel_name, + std::move(k)); + } + + template + requires(Domain == {Ts...}) + inline + std::enable_if_t::value> grid_launch_hip_( + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + grid_launch_hip_impl_( + is_new_grid_launch_t{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + kernel_name, + std::move(k)); + } + + template + requires(Domain == {Ts...}) + inline + std::enable_if_t::value> grid_launch_hip_( + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + K k) + { + grid_launch_hip_impl_( + is_new_grid_launch_t{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + std::move(k)); + } + + // TODO: these are temporary and purposefully noisy and disruptive. + #define make_kernel_name_hip(k, n)\ + HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ + HIP_kernel_functor_name_end ## _ ## n + + #define make_kernel_functor_hip_30(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26, p27)\ + struct make_kernel_name_hip(function_name, 28) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + std::decay_t _p27_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_, _p27_);\ + }\ + } + #define make_kernel_functor_hip_29(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26)\ + struct make_kernel_name_hip(function_name, 27) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_);\ + }\ + } + #define make_kernel_functor_hip_28(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25)\ + struct make_kernel_name_hip(function_name, 26) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ + }\ + } + #define make_kernel_functor_hip_27(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24)\ + struct make_kernel_name_hip(function_name, 25) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ + }\ + } + #define make_kernel_functor_hip_26(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ + struct make_kernel_name_hip(function_name, 24) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ + }\ + } + #define make_kernel_functor_hip_25(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ + struct make_kernel_name_hip(function_name, 23) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + __attribute__((used, flatten))\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_);\ + }\ + } + #define make_kernel_functor_hip_24(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ + struct make_kernel_name_hip(function_name, 22) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_);\ + }\ + } + #define make_kernel_functor_hip_23(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ + struct make_kernel_name_hip(function_name, 21) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_);\ + }\ + } + #define make_kernel_functor_hip_22(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ + struct make_kernel_name_hip(function_name, 20) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_);\ + }\ + } + #define make_kernel_functor_hip_21(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18)\ + struct make_kernel_name_hip(function_name, 19) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_);\ + }\ + } + #define make_kernel_functor_hip_20(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17)\ + struct make_kernel_name_hip(function_name, 18) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ + }\ + } + #define make_kernel_functor_hip_19(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16)\ + struct make_kernel_name_hip(function_name, 17) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ + }\ + } + #define make_kernel_functor_hip_18(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15)\ + struct make_kernel_name_hip(function_name, 16) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ + }\ + } + #define make_kernel_functor_hip_17(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14)\ + struct make_kernel_name_hip(function_name, 15) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_);\ + }\ + } + #define make_kernel_functor_hip_16(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13)\ + struct make_kernel_name_hip(function_name, 14) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_);\ + }\ + } + #define make_kernel_functor_hip_15(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12)\ + struct make_kernel_name_hip(function_name, 13) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_);\ + }\ + } + #define make_kernel_functor_hip_14(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11)\ + struct make_kernel_name_hip(function_name, 12) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_);\ + }\ + } + #define make_kernel_functor_hip_13(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ + struct make_kernel_name_hip(function_name, 11) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_);\ + }\ + } + #define make_kernel_functor_hip_12(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ + struct make_kernel_name_hip(function_name, 10) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ + _p9_);\ + }\ + } + #define make_kernel_functor_hip_11(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ + struct make_kernel_name_hip(function_name, 9) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ + }\ + } + #define make_kernel_functor_hip_10(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ + struct make_kernel_name_hip(function_name, 8) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ + }\ + } + #define make_kernel_functor_hip_9(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ + struct make_kernel_name_hip(function_name, 7) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ + }\ + } + #define make_kernel_functor_hip_8(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ + struct make_kernel_name_hip(function_name, 6) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ + }\ + } + #define make_kernel_functor_hip_7(\ + function_name, kernel_name, p0, p1, p2, p3, p4)\ + struct make_kernel_name_hip(function_name, 5) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ + }\ + } + #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ + struct make_kernel_name_hip(function_name, 4) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_);\ + }\ + } + #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ + struct make_kernel_name_hip(function_name, 3) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_);\ + }\ + } + #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ + struct make_kernel_name_hip(function_name, 2) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_);\ + }\ + } + #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n + #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ + struct make_kernel_name_hip(function_name, 1) {\ + std::decay_t _p0_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_);\ + }\ + } + #define make_kernel_functor_hip_2(function_name, kernel_name)\ + struct make_kernel_name_hip(function_name, 0) {\ + void operator()(const hc::tiled_index<3>&) [[hc]]\ + {\ + return kernel_name(hipLaunchParm{});\ + }\ + } + #define make_kernel_functor_hip_1(...) + #define make_kernel_functor_hip_0(...) + #define make_kernel_functor_hip_(...)\ + overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) + + + #define hipLaunchNamedKernelGGL(\ + function_name,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ...)\ + do {\ + make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ + hip_kernel_functor_impl_{__VA_ARGS__};\ + hip_impl::grid_launch_hip_(\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + #kernel_name,\ + hip_kernel_functor_impl_);\ + } while(0) + + #define hipLaunchKernelGGL(\ + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchNamedKernelGGL(\ + unnamed,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ##__VA_ARGS__);\ + } while (0) + + #define hipLaunchKernel(\ + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) +} \ No newline at end of file diff --git a/hipamd/include/hip/hcc_detail/program_state.hpp b/hipamd/include/hip/hcc_detail/program_state.hpp new file mode 100644 index 0000000000..65896e97a7 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/program_state.hpp @@ -0,0 +1,84 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct ihipModuleSymbol_t; +using hipFunction_t = ihipModuleSymbol_t*; + +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + +namespace hip_impl +{ + struct Kernel_descriptor { + std::uint64_t kernel_object_; + std::uint32_t group_size_; + std::uint32_t private_size_; + std::string name_; + + operator hipFunction_t() const + { // TODO: this is awful and only meant for illustration. + return reinterpret_cast( + const_cast(this)); + } + }; + + using RAII_global = std::unique_ptr; + + const std::unordered_map< + hsa_agent_t, std::vector>& executables(); + const std::unordered_map< + std::uintptr_t, + std::vector>>& functions(); + const std::unordered_map& function_names(); + std::unordered_map& globals(); + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, std::istream& file); +} // Namespace hip_impl. \ No newline at end of file diff --git a/hipamd/src/code_object_bundle.cpp b/hipamd/src/code_object_bundle.cpp new file mode 100644 index 0000000000..d7d2cd1e10 --- /dev/null +++ b/hipamd/src/code_object_bundle.cpp @@ -0,0 +1,39 @@ +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include + +#include +#include +#include + +hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple) +{ + static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx"; + static constexpr std::size_t prefix_sz = sizeof(prefix) - 1; + + hsa_isa_t r = {}; + + auto idx = triple.find(prefix); + + if (idx != std::string::npos) { + idx += prefix_sz; + std::string tmp = "AMD:AMDGPU"; + while (idx != triple.size()) { + tmp.push_back(':'); + tmp.push_back(triple[idx++]); + } + + hsa_isa_from_name(tmp.c_str(), &r); + } + + return r; +} + +// DATA - STATICS +constexpr const char hip_impl::Bundled_code_header::magic_string_[]; + +// CREATORS +hip_impl::Bundled_code_header::Bundled_code_header( + const std::vector& x) + : Bundled_code_header{x.cbegin(), x.cend()} +{} \ No newline at end of file diff --git a/hipamd/src/device_functions.cpp b/hipamd/src/device_functions.cpp index 3c9bf334fa..396b5b0594 100644 --- a/hipamd/src/device_functions.cpp +++ b/hipamd/src/device_functions.cpp @@ -28,27 +28,6 @@ extern "C" float __ocml_rint_f32(float); extern "C" float __ocml_ceil_f32(float); extern "C" float __ocml_trunc_f32(float); -struct holder64Bit{ - union{ - double d; - unsigned long int uli; - signed long int sli; - signed int si[2]; - unsigned int ui[2]; - }; -} __attribute__((aligned(8))); - -struct holder32Bit { - union { - float f; - unsigned int ui; - signed int si; - }; -} __attribute__((aligned(4))); - -__device__ struct holder64Bit hold64; -__device__ struct holder32Bit hold32; - __device__ float __double2float_rd(double x) { return (double)x; @@ -69,13 +48,21 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - hold64.d = x; - return hold64.si[1]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[1]; } __device__ int __double2loint(double x) { - hold64.d = x; - return hold64.si[0]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[0]; } @@ -150,8 +137,12 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - hold64.d = x; - return hold64.sli; + static_assert(sizeof(long long) == sizeof(double), ""); + + long long tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ int __float2int_rd(float x) @@ -224,19 +215,32 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - hold32.f = x; - return hold32.si; + static_assert(sizeof(int) == sizeof(float), ""); + + int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ unsigned int __float_as_uint(float x) { - hold32.f = x; - return hold32.ui; + static_assert(sizeof(unsigned int) == sizeof(float), ""); + + unsigned int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } -__device__ double __hiloint2double(int hi, int lo) +__device__ double __hiloint2double(int32_t hi, int32_t lo) { - hold64.si[1] = hi; - hold64.si[0] = lo; - return hold64.d; + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + uint64_t tmp0 = + (static_cast(hi) << 32ull) | static_cast(lo); + double tmp1; + __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + + return tmp1; } __device__ double __int2double_rn(int x) { @@ -262,8 +266,12 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - hold32.si = x; - return hold32.f; + static_assert(sizeof(float) == sizeof(int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ll2double_rd(long long int x) @@ -302,8 +310,12 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - hold64.sli = x; - return hold64.d; + static_assert(sizeof(double) == sizeof(long long), ""); + + double tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return x; } __device__ double __uint2double_rn(int x) @@ -330,8 +342,12 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - hold32.ui = x; - return hold32.f; + static_assert(sizeof(float) == sizeof(unsigned int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ull2double_rd(unsigned long long int x) diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 367a4c1a4f..b6aebdfce0 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr) // loop unrolling __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) { - uint8_t *dstPtr, *srcPtr; - dstPtr = (uint8_t*)dst; - srcPtr = (uint8_t*)src; - for(uint32_t i=0;i(dst); + auto srcPtr = static_cast(src); + + while (size >= 4u) { + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + + size -= 4u; + srcPtr += 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = srcPtr[2]; + case 2: dstPtr[1] = srcPtr[1]; + case 1: dstPtr[0] = srcPtr[0]; + } + + return dst; } -__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size) +__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) { - uint8_t *dstPtr; - dstPtr = (uint8_t*)ptr; - for(uint32_t i=0;i(dst); + + while (size >= 4u) { + dstPtr[0] = val; + dstPtr[1] = val; + dstPtr[2] = val; + dstPtr[3] = val; + + size -= 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = val; + case 2: dstPtr[1] = val; + case 1: dstPtr[0] = val; + } + + return dst; } __device__ float __hip_erfinvf(float x){ diff --git a/hipamd/src/functional_grid_launch.inl b/hipamd/src/functional_grid_launch.inl new file mode 100644 index 0000000000..b555967ebc --- /dev/null +++ b/hipamd/src/functional_grid_launch.inl @@ -0,0 +1,137 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hcc_detail/program_state.hpp" + +#include "hip/hip_runtime_api.h" + +// Internal header, do not percolate upwards. +#include "hip_hcc_internal.h" +#include "hc.hpp" +#include "trace_helper.h" + +#include +#include +#include +#include + +#include + +using namespace hc; +using namespace std; + +namespace hip_impl +{ + namespace + { + inline + string name(uintptr_t function_address) + { + const auto it = function_names().find(function_address); + + if (it == function_names().cend()) { + throw runtime_error{ + "Invalid function passed to hipLaunchKernelGGL."}; + } + + return it->second; + } + + inline + string name(hsa_agent_t agent) + { + char n[64] = {}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return string{n}; + } + + inline + hsa_agent_t target_agent(hipStream_t stream) + { + if (stream) { + return *static_cast( + stream->locked_getAv()->get_hsa_agent()); + } + else if ( + ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + return ihipGetDevice( + ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; + } + else { + return *static_cast( + accelerator{}.get_default_view().get_hsa_agent()); + } + } + } + + void hipLaunchKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, + hipStream_t stream, + void** kernarg) + { + const auto it0 = functions().find(function_address); + + if (it0 == functions().cend()) { + throw runtime_error{ + "No device code available for function: " + + name(function_address) + }; + } + + auto agent = target_agent(stream); + + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first.handle == agent.handle; + }); + + if (it1 == it0->second.cend()) { + throw runtime_error{ + "No code available for function: " + name(function_address) + + ", for agent: " + name(agent) + }; + } + + for (auto&& agent_kernel : it0->second) { + if (agent.handle == agent_kernel.first.handle) { + hipModuleLaunchKernel( + agent_kernel.second, + numBlocks.x, + numBlocks.y, + numBlocks.z, + dimBlocks.x, + dimBlocks.y, + dimBlocks.z, + sharedMemBytes, + stream, + nullptr, + kernarg); + } + } + } +} diff --git a/hipamd/src/grid_launch.cpp b/hipamd/src/grid_launch.cpp index 9b3cf509c5..1fe47c189a 100644 --- a/hipamd/src/grid_launch.cpp +++ b/hipamd/src/grid_launch.cpp @@ -22,78 +22,8 @@ THE SOFTWARE. #include "hip/hcc_detail/grid_launch_GGL.hpp" -// Internal header, do not percolate upwards. -#include "hip_hcc_internal.h" -#include "hc.hpp" -#include "trace_helper.h" - -#include -#include - -namespace hip_impl -{ - hc::accelerator_view lock_stream_hip_( - hipStream_t& stream, void*& locked_stream) - { // This allocated but does not take ownership of locked_stream. If it is - // not deleted elsewhere it will leak. - using L = decltype(stream->lockopen_preKernelCommand()); - - HIP_INIT(); - - stream = ihipSyncAndResolveStream(stream); - locked_stream = new L{stream->lockopen_preKernelCommand()}; - return (*static_cast(locked_stream))->_av; - } - - void print_prelaunch_trace_( - const char* kernel_name, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream) - { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || - HIP_PROFILE_API || - (COMPILE_HIP_DB && (HIP_TRACE_API & (1<lockopen_preKernelCommand()); - - stream->lockclose_postKernelCommand(kernel_name, acc_v); - - delete static_cast(locked_stream); - locked_stream = nullptr; - if(HIP_PROFILE_API) { - MARKER_END(); - } - } -} +#if __hcc_workweek__ >= 17481 + #include "functional_grid_launch.inl" +#else + #include "macro_based_grid_launch.inl" +#endif \ No newline at end of file diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h index e7650260a9..3fd09630d9 100644 --- a/hipamd/src/hip_hcc_internal.h +++ b/hipamd/src/hip_hcc_internal.h @@ -248,7 +248,7 @@ static const DbName dbName [] = #if COMPILE_HIP_DB #define tprintf(trace_level, ...) {\ if (HIP_DB & (1<<(trace_level))) {\ - char msgStr[1000];\ + char msgStr[2000];\ snprintf(msgStr, 2000, __VA_ARGS__);\ fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \ }\ diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 75c054f5e1..fb25101d7e 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -808,6 +808,26 @@ hipError_t hipHostUnregister(void *hostPtr) return ihipLogStatus(hip_status); } +namespace +{ + inline + hipDeviceptr_t agent_address_for_symbol(const char* symbolName) + { + hipDeviceptr_t r = nullptr; + + #if __hcc_workweek__ >= 17481 + size_t byte_cnt = 0u; + hipModuleGetGlobal(&r, &byte_cnt, 0, symbolName); + #else + auto ctx = ihipGetTlsDefaultCtx(); + auto acc = ctx->getDevice()->_acc; + r = acc.get_symbol_address(symbolName); + #endif + + return r; + } +} + hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind); @@ -821,7 +841,8 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -856,7 +877,8 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -893,7 +915,8 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -931,7 +954,8 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src); if(src == nullptr || dst == nullptr) @@ -1285,42 +1309,59 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) return ihipLogStatus(e); } -// TODO - make member function of stream? +namespace +{ + template< + uint32_t block_dim, + typename RandomAccessIterator, + typename N, + typename T> + __global__ + void hip_fill_n(RandomAccessIterator f, N n, T value) + { + const uint32_t grid_dim = gridDim.x * blockDim.x; + + size_t idx = blockIdx.x * block_dim + threadIdx.x; + while (idx < n) { + __builtin_memcpy( + reinterpret_cast(&f[idx]), + reinterpret_cast(&value), + sizeof(T)); + idx += grid_dim; + } + } + + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + const T& clamp_integer(const T& x, const T& lower, const T& upper) + { + assert(!(upper < lower)); + + return std::min(upper, std::max(x, lower)); + } +} + template void ihipMemsetKernel(hipStream_t stream, - LockedAccessor_StreamCrit_t &crit, - T * ptr, T val, size_t sizeBytes, - hc::completion_future *cf) + T * ptr, T val, size_t sizeBytes) { - int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits); - const int threads_per_wg = 256; + static constexpr uint32_t block_dim = 256; - int threads = wg * threads_per_wg; - if (threads > sizeBytes) { - threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg; - } - - - hc::extent<1> ext(threads); - auto ext_tile = ext.tile(threads_per_wg); - - *cf = - hc::parallel_for_each( - crit->_av, - ext_tile, - [=] (hc::tiled_index<1> idx) - __attribute__((hc)) - { - int offset = amp_get_global_id(0); - // TODO-HCC - change to hc_get_local_size() - int stride = amp_get_local_size(0) * hc_get_num_groups(0) ; - - for (int i=offset; i( + sizeBytes / block_dim, 1, UINT32_MAX); + hipLaunchKernelGGL( + hip_fill_n, + dim3(grid_dim), + dim3{block_dim}, + 0u, + stream, + ptr, + sizeBytes, + std::move(val)); } @@ -1334,17 +1375,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1352,19 +1388,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av); - - if (HIP_API_BLOCKING) { tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); } } else { e = hipErrorInvalidValue; @@ -1385,16 +1418,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1402,21 +1431,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1437,17 +1463,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - size_t sizeBytes = pitch * height; if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1455,20 +1477,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1489,36 +1509,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { + std::cout << ex.what() << std::endl; e = hipErrorInvalidValue; } } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1717,7 +1731,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ HIP_INIT_API ( handle, devPtr); hipError_t hipStatus = hipSuccess; // Get the size of allocated pointer - size_t psize; + size_t psize = 0u; hc::accelerator acc; if((handle == NULL) || (devPtr == NULL)) { hipStatus = hipErrorInvalidResourceHandle; diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index f4e4b01ac2..1477247ae2 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -37,6 +37,7 @@ THE SOFTWARE. #include "elfio/elfio.hpp" #include "hip/hip_runtime.h" +#include "hip/hcc_detail/program_state.hpp" #include "hip_hcc_internal.h" #include "trace_helper.h" @@ -165,159 +166,8 @@ uint64_t ElfSize(const void *emi){ return total_size; } -namespace -{ - template - inline - ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) - { - using namespace std; - - const auto it = find_if( - reader.sections.begin(), reader.sections.end(), move(p)); - - return it != reader.sections.end() ? *it : nullptr; - } - - inline - std::vector copy_names_of_undefined_symbols( - const ELFIO::symbol_section_accessor& section) - { - using namespace ELFIO; - using namespace std; - - vector r; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (sect_idx == SHN_UNDEF && !name.empty()) { - r.push_back(std::move(name)); - } - } - - return r; - } - - inline - std::pair find_symbol_address( - const ELFIO::symbol_section_accessor& section, - const std::string& symbol_name) - { - using namespace ELFIO; - using namespace std; - - static const pair r{0, 0}; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (name == symbol_name) return make_pair(value, size); - } - - return r; - } - - inline - void associate_code_object_symbols_with_host_allocation( - hipModule_t module, - const ELFIO::elfio& reader, - const ELFIO::elfio& self_reader, - ELFIO::section* code_object_dynsym, - ELFIO::section* process_symtab, - hsa_agent_t agent, - hsa_executable_t executable) - { - using namespace ELFIO; - using namespace std; - - if (!code_object_dynsym || !process_symtab) return; - - const auto undefined_symbols = copy_names_of_undefined_symbols( - symbol_section_accessor{reader, code_object_dynsym}); - - for (auto&& x : undefined_symbols) { - const auto tmp = find_symbol_address( - symbol_section_accessor{self_reader, process_symtab}, x); - - assert(tmp.first); - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - - static vector< - unique_ptr> globals; - static mutex mtx; - - lock_guard lck{mtx}; - globals.emplace_back(p, hsa_amd_memory_unlock); - if (module->coGlobals.count(x) == 0) { - module->coGlobals.emplace(x, tmp.first); - } - } - } - - inline - void load_code_object_and_freeze_executable( - const char* file, hsa_agent_t agent, hsa_executable_t executable) - { // TODO: the following sequence is inefficient, should be refactored - // into a single load of the file and subsequent ELFIO - // processing. - using namespace std; - - static const auto cor_deleter = [](hsa_code_object_reader_t* p) { - hsa_code_object_reader_destroy(*p); - }; - - using RAII_code_reader = unique_ptr< - hsa_code_object_reader_t, decltype(cor_deleter)>; - - unique_ptr cobj{fopen(file, "r"), fclose}; - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get()); - - hsa_executable_load_agent_code_object( - executable, agent, *tmp, nullptr, nullptr); - - hsa_executable_freeze(executable, nullptr); - - static vector code_readers; - static mutex mtx; - - lock_guard lck{mtx}; - code_readers.push_back(move(tmp)); - } -} - hipError_t hipModuleLoad(hipModule_t *module, const char *fname) { - using namespace ELFIO; - HIP_INIT_API(module, fname); hipError_t ret = hipSuccess; *module = new ihipModule_t; @@ -340,37 +190,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname) nullptr, &(*module)->executable); - elfio reader; - if (!reader.load(fname)) { + std::ifstream file{fname}; + + if (!file.is_open()) { return ihipLogStatus(hipErrorFileNotFound); } - else { - // TODO: this may benefit from caching as well. - elfio self_reader; - self_reader.load("/proc/self/exe"); - - const auto symtab = - find_section_if(self_reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - const auto code_object_dynsym = - find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_DYNSYM; - }); - - associate_code_object_symbols_with_host_allocation( - *module, - reader, - self_reader, - code_object_dynsym, - symtab, - currentDevice->_hsaAgent, - (*module)->executable); - - load_code_object_and_freeze_executable( - fname, currentDevice->_hsaAgent, (*module)->executable); - } + (*module)->executable = hip_impl::load_executable( + (*module)->executable, currentDevice->_hsaAgent, file); + ret = (*module)->executable.handle ? hipSuccess : hipErrorUnknown; } return ihipLogStatus(ret); @@ -725,16 +552,93 @@ namespace } inline - std::vector read_agent_globals(hipModule_t hmodule) + std::vector read_agent_globals( + hsa_agent_t agent, hsa_executable_t executable) { std::vector r; - hsa_executable_iterate_agent_symbols( - hmodule->executable, this_agent(), copy_agent_global_variables, &r); + executable, agent, copy_agent_global_variables, &r); return r; } + + template + std::pair read_global_description( + ForwardIterator f, ForwardIterator l, const char* name) + { + const auto it = std::find_if( + f, l, [=](const Agent_global& x) { return x.name == name; }); + + return it == l ? + std::make_pair(nullptr, 0u) : + std::make_pair(it->address, it->byte_cnt); + } + + hipError_t read_agent_global_from_module( + hipDeviceptr_t *dptr, + size_t* bytes, + hipModule_t hmod, + const char* name) + { + static std::unordered_map< + hipModule_t, std::vector> agent_globals; + + // TODO: this is not particularly robust. + if (agent_globals.count(hmod) == 0) { + static std::mutex mtx; + std::lock_guard lck{mtx}; + + if (agent_globals.count(hmod) == 0) { + agent_globals.emplace( + hmod, read_agent_globals(this_agent(), hmod->executable)); + } + } + + // TODO: This is unsafe iff some other emplacement triggers rehashing. + // It will have to be properly fleshed out in the future. + const auto it0 = agent_globals.find(hmod); + if (it0 == agent_globals.cend()) { + throw std::runtime_error{"agent_globals data structure corrupted."}; + } + + std::tie(*dptr, *bytes) = read_global_description( + it0->second.cbegin(), it0->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } + + hipError_t read_agent_global_from_process( + hipDeviceptr_t *dptr, size_t* bytes, const char* name) + { + static std::unordered_map< + hsa_agent_t, std::vector> agent_globals; + static std::once_flag f; + + std::call_once(f, []() { + for (auto&& agent_executables : hip_impl::executables()) { + std::vector tmp0; + for (auto&& executable : agent_executables.second) { + auto tmp1 = read_agent_globals( + agent_executables.first, executable); + tmp0.insert( + tmp0.end(), + std::make_move_iterator(tmp1.begin()), + std::make_move_iterator(tmp1.end())); + } + agent_globals.emplace(agent_executables.first, std::move(tmp0)); + } + }); + + const auto it = agent_globals.find(this_agent()); + + if (it == agent_globals.cend()) return hipErrorNotInitialized; + + std::tie(*dptr, *bytes) = read_global_description( + it->second.cbegin(), it->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, @@ -745,41 +649,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, if(dptr == NULL || bytes == NULL){ return ihipLogStatus(hipErrorInvalidValue); } - if(name == NULL || hmod == NULL){ + if(name == NULL){ return ihipLogStatus(hipErrorNotInitialized); } else{ - static std::unordered_map< - hipModule_t, std::vector> agent_globals; + ret = hmod ? + read_agent_global_from_module(dptr, bytes, hmod, name) : + read_agent_global_from_process(dptr, bytes, name); - // TODO: this is not particularly robust. - if (agent_globals.count(hmod) == 0) { - static std::mutex mtx; - std::lock_guard lck{mtx}; - - if (agent_globals.count(hmod) == 0) { - agent_globals.emplace(hmod, read_agent_globals(hmod)); - } - } - - // TODO: This is unsafe iff some other emplacement triggers rehashing. - // It will have to be properly fleshed out in the future. - const auto it0 = agent_globals.find(hmod); - if (it0 == agent_globals.cend()) { - throw std::runtime_error{"agent_globals data structure corrupted."}; - } - - const auto it1 = std::find_if( - it0->second.cbegin(), - it0->second.cend(), - [=](const Agent_global& x) { return x.name == name; }); - - if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound); - - *dptr = it1->address; - *bytes = it1->byte_cnt; - - return ihipLogStatus(hipSuccess); + return ihipLogStatus(ret); } } @@ -848,9 +726,9 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const if(name == NULL || hmod == NULL){ ret = hipErrorNotInitialized; } else{ - const auto it = hmod->coGlobals.find(name); - if (it == hmod->coGlobals.end()) return ihipLogStatus(hipErrorInvalidValue); - *texRef = reinterpret_cast(it->second); + const auto it = hip_impl::globals().find(name); + if (it == hip_impl::globals().end()) return ihipLogStatus(hipErrorInvalidValue); + *texRef = reinterpret_cast(it->second.get()); ret = hipSuccess; } } diff --git a/hipamd/src/macro_based_grid_launch.inl b/hipamd/src/macro_based_grid_launch.inl new file mode 100644 index 0000000000..5547d3a71a --- /dev/null +++ b/hipamd/src/macro_based_grid_launch.inl @@ -0,0 +1,97 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// Internal header, do not percolate upwards. +#include "hip_hcc_internal.h" +#include "hc.hpp" +#include "trace_helper.h" + +#include +#include + +namespace hip_impl +{ + hc::accelerator_view lock_stream_hip_( + hipStream_t& stream, void*& locked_stream) + { // This allocated but does not take ownership of locked_stream. If it is + // not deleted elsewhere it will leak. + using L = decltype(stream->lockopen_preKernelCommand()); + + HIP_INIT(); + + stream = ihipSyncAndResolveStream(stream); + locked_stream = new L{stream->lockopen_preKernelCommand()}; + return (*static_cast(locked_stream))->_av; + } + + void print_prelaunch_trace_( + const char* kernel_name, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream) + { + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || + HIP_PROFILE_API || + (COMPILE_HIP_DB && (HIP_TRACE_API & (1<lockopen_preKernelCommand()); + + stream->lockclose_postKernelCommand(kernel_name, acc_v); + + delete static_cast(locked_stream); + locked_stream = nullptr; + if(HIP_PROFILE_API) { + MARKER_END(); + } + } +} \ No newline at end of file diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp new file mode 100644 index 0000000000..47071d0236 --- /dev/null +++ b/hipamd/src/program_state.cpp @@ -0,0 +1,567 @@ +#include "../include/hip/hcc_detail/program_state.hpp" + +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include "hip_hcc_internal.h" +#include "trace_helper.h" + +#include "elfio/elfio.hpp" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ELFIO; +using namespace hip_impl; +using namespace std; + +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_isa_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_isa_t x, hsa_isa_t y) +{ + return x.handle == y.handle; +} + +namespace +{ + struct Symbol { + std::string name; + ELFIO::Elf64_Addr value = 0; + ELFIO::Elf_Xword size = 0; + ELFIO::Elf_Half sect_idx = 0; + std::uint8_t bind = 0; + std::uint8_t type = 0; + std::uint8_t other = 0; + }; + + inline + Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx) + { + assert(idx < section.get_symbols_num()); + + Symbol r; + section.get_symbol( + idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); + + return r; + } + + template + inline + section* find_section_if(elfio& reader, P p) + { + const auto it = find_if( + reader.sections.begin(), reader.sections.end(), move(p)); + + return it != reader.sections.end() ? *it : nullptr; + } + + vector copy_names_of_undefined_symbols( + const symbol_section_accessor& section) + { + vector r; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + + auto tmp = read_symbol(section, i); + if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) { + r.push_back(std::move(tmp.name)); + } + } + + return r; + } + + const std::unordered_map< + std::string, + std::pair>& symbol_addresses() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { + static constexpr const char self[] = "/proc/self/exe"; + elfio reader; + + static unsigned int iter = 0u; + if (reader.load(!iter ? self : info->dlpi_name)) { + auto it = find_section_if( + reader, [](const class section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + if (it) { + const symbol_section_accessor symtab{reader, it}; + + for (auto i = 0u; i != symtab.get_symbols_num(); ++i) { + auto tmp = read_symbol(symtab, i); + + if (tmp.type == STT_OBJECT && + tmp.sect_idx != SHN_UNDEF) { + const auto addr = + tmp.value + (iter ? info->dlpi_addr : 0); + r.emplace( + move(tmp.name), make_pair(addr, tmp.size)); + } + } + } + + ++iter; + } + + return 0; + }, nullptr); + }); + + return r; + } + + void associate_code_object_symbols_with_host_allocation( + const elfio& reader, + const elfio& self_reader, + section* code_object_dynsym, + section* process_symtab, + hsa_agent_t agent, + hsa_executable_t executable) + { + if (!code_object_dynsym || !process_symtab) return; + + const auto undefined_symbols = copy_names_of_undefined_symbols( + symbol_section_accessor{reader, code_object_dynsym}); + + for (auto&& x : undefined_symbols) { + if (globals().find(x) != globals().cend()) return; + + const auto it1 = symbol_addresses().find(x); + + if (it1 == symbol_addresses().cend()) { + throw runtime_error{"Global symbol: " + x + " is undefined."}; + } + + static mutex mtx; + lock_guard lck{mtx}; + + if (globals().find(x) != globals().cend()) return; + + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(it1->second.first), + it1->second.second, + nullptr, // All agents. + 0, + &p); + + hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + + globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock}); + } + } + + vector code_object_blob_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + static constexpr const char kernel_section[] = ".kernel"; + + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{"Failed to load ELF file for current process."}; + } + + auto kernels = find_section_if(reader, [](const section* x) { + return x->get_name() == kernel_section; + }); + + vector r; + if (kernels) { + r.insert( + r.end(), + kernels->get_data(), + kernels->get_data() + kernels->get_size()); + } + + return r; + } + + const unordered_map>>& code_object_blobs() + { + static unordered_map>> r; + static once_flag f; + + call_once(f, []() { + static vector> blobs{ + code_object_blob_for_process()}; + + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) { + elfio tmp; + if (tmp.load(info->dlpi_name)) { + const auto it = find_section_if(tmp, [](const section* x) { + return x->get_name() == ".kernel"; + }); + + if (it) blobs.emplace_back( + it->get_data(), it->get_data() + it->get_size()); + } + return 0; + }, nullptr); + + for (auto&& blob : blobs) { + Bundled_code_header tmp{blob}; + if (valid(tmp)) { + for (auto&& bundle : bundles(tmp)) { + r[triple_to_hsa_isa(bundle.triple)].push_back( + bundle.blob); + } + } + } + }); + + return r; + } + + vector> function_names_for( + const elfio& reader, section* symtab) + { + vector> r; + symbol_section_accessor symbols{reader, symtab}; + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + auto tmp = read_symbol(symbols, i); + + if (tmp.type == STT_FUNC && + tmp.sect_idx != SHN_UNDEF && + !tmp.name.empty()) { + r.emplace_back(tmp.value, tmp.name); + } + } + + return r; + } + + const vector>& function_names_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + + static vector> r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + if (symtab) r = function_names_for(reader, symtab); + }); + + return r; + } + + inline + hsa_agent_t agent(hsa_executable_symbol_t x) + { + hsa_agent_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); + + return r; + } + + inline + uint32_t group_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); + + return r; + } + + inline + uint64_t kernel_object(hsa_executable_symbol_t x) + { + uint64_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); + + return r; + } + + inline + string name(hsa_executable_symbol_t x) + { + uint32_t sz = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + string r(sz, '\0'); + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; + } + + inline + uint32_t private_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); + + return r; + } + + inline + hsa_symbol_kind_t type(hsa_executable_symbol_t x) + { + hsa_symbol_kind_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); + + return r; + } + + const unordered_map>& kernels() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto copy_kernels = []( + hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) { + if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s); + + return HSA_STATUS_SUCCESS; + }; + + for (auto&& agent_executables : executables()) { + for (auto&& executable : agent_executables.second) { + hsa_executable_iterate_agent_symbols( + executable, + agent_executables.first, + copy_kernels, + nullptr); + } + } + }); + + return r; + } + + void load_code_object_and_freeze_executable( + istream& file, hsa_agent_t agent, hsa_executable_t executable) + { // TODO: the following sequence is inefficient, should be refactored + // into a single load of the file and subsequent ELFIO + // processing. + static const auto cor_deleter = [](hsa_code_object_reader_t* p) { + hsa_code_object_reader_destroy(*p); + }; + + using RAII_code_reader = unique_ptr< + hsa_code_object_reader_t, decltype(cor_deleter)>; + + file.seekg(0); + + vector blob{ + istreambuf_iterator{file}, istreambuf_iterator{}}; + RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; + hsa_code_object_reader_create_from_memory( + blob.data(), blob.size(), tmp.get()); + + hsa_executable_load_agent_code_object( + executable, agent, *tmp, nullptr, nullptr); + + hsa_executable_freeze(executable, nullptr); + + static vector code_readers; + static mutex mtx; + + lock_guard lck{mtx}; + code_readers.push_back(move(tmp)); + } +} + +namespace hip_impl +{ + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent || !acc.is_hsa_accelerator()) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + return r; + } + + const unordered_map& function_names() + { + static unordered_map r{ + function_names_for_process().cbegin(), + function_names_for_process().cend()}; + static once_flag f; + + call_once(f, []() { + dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { + elfio tmp; + if (tmp.load(info->dlpi_name)) { + const auto it = find_section_if(tmp, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + if (it) { + auto n = function_names_for(tmp, it); + + for (auto&& f : n) f.first += info->dlpi_addr; + + r.insert( + make_move_iterator(n.begin()), + make_move_iterator(n.end())); + } + } + + return 0; + }, nullptr); + }); + + return r; + } + + const unordered_map< + uintptr_t, vector>>& functions() + { + static unordered_map< + uintptr_t, vector>> r; + static once_flag f; + + call_once(f, []() { + for (auto&& function : function_names()) { + const auto it = kernels().find(function.second); + + if (it != kernels().cend()) { + for (auto&& kernel_symbol : it->second) { + r[function.first].emplace_back( + agent(kernel_symbol), + Kernel_descriptor{ + kernel_object(kernel_symbol), + group_size(kernel_symbol), + private_size(kernel_symbol), + it->first}); + } + } + } + }); + + return r; + } + + unordered_map& globals() + { + static unordered_map r; + static once_flag f; + call_once(f, []() { r.reserve(symbol_addresses().size()); }); + + return r; + } + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, istream& file) + { + elfio reader; + if (!reader.load(file)) { + return hsa_executable_t{}; + } + else { + // TODO: this may benefit from caching as well. + elfio self_reader; + self_reader.load("/proc/self/exe"); + + const auto symtab = + find_section_if(self_reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + const auto code_object_dynsym = + find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_DYNSYM; + }); + + associate_code_object_symbols_with_host_allocation( + reader, self_reader, code_object_dynsym, symtab, agent, executable); + + load_code_object_and_freeze_executable(file, agent, executable); + + return executable; + } + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp b/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp index 46656a434c..e845ae8f2f 100644 --- a/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp +++ b/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp @@ -23,7 +23,7 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In) __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) { int tx = threadIdx.x; - memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN))); + memset(ptr + tx, val, sizeof(uint32_t)); } int main() diff --git a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp index 63d50e881e..4db522cc10 100644 --- a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp @@ -159,11 +159,16 @@ bool dataTypesRun(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -221,11 +226,16 @@ bool dataTypesRun2(){ HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -281,11 +291,16 @@ bool dataTypesRun4(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); diff --git a/hipamd/tests/src/runtimeApi/event/hipEventRecord.cpp b/hipamd/tests/src/runtimeApi/event/hipEventRecord.cpp index 5606b4ab9b..5f33882234 100644 --- a/hipamd/tests/src/runtimeApi/event/hipEventRecord.cpp +++ b/hipamd/tests/src/runtimeApi/event/hipEventRecord.cpp @@ -69,7 +69,16 @@ int main(int argc, char *argv[]) // Record the start event HIPCHECK (hipEventRecord(start, NULL)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK (hipEventRecord(stop, NULL)); diff --git a/hipamd/tests/src/runtimeApi/event/record_event.cpp b/hipamd/tests/src/runtimeApi/event/record_event.cpp index b9653bf522..a7b99749cb 100644 --- a/hipamd/tests/src/runtimeApi/event/record_event.cpp +++ b/hipamd/tests/src/runtimeApi/event/record_event.cpp @@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + stream, + static_cast(C_d), + C_h, + numElements, + count); HIPCHECK(hipEventRecord(stop, stream)); diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index d8438fa848..b3f25658fc 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(dmem->A_d()), + static_cast(dmem->B_d()), + dmem->C_d(), + numElements); if (useDeviceToDevice) { // Do an extra device-to-device copy here to mix things up: diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp index ccb02b74ce..c64b01f8a7 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp @@ -49,21 +49,39 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - + + HIPCHECK(hipSetDevice(1)); - HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes)); + HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes)); HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); @@ -73,8 +91,8 @@ int main() HIPCHECK(hipFree(Y_d)); HIPCHECK(hipFree(Z_d)); } - + passed(); - + } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp index 5c99b43564..6d21ac62e7 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp @@ -50,25 +50,43 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HIPCHECK(hipStreamCreate(&s)); + + HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); - HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s)); + HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s)); HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s)); HIPCHECK(hipStreamSynchronize(s)); HIPCHECK(hipDeviceSynchronize()); - + HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); @@ -78,7 +96,7 @@ int main() } passed(); - + } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp index 7e2fc2d3d0..95b19c1090 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp @@ -48,24 +48,42 @@ int main() HIPCHECK(hipMalloc(&X_d,Nbytes)); HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HIPCHECK(hipSetDevice(1)); hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs. hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); HIPCHECK(hipFree(Y_d)); @@ -74,7 +92,7 @@ int main() passed(); - + } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp index b01a0aeb1d..2382850ec4 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp @@ -51,26 +51,44 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HIPCHECK(hipStreamCreate(&s)); + + HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s)); HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HIPCHECK (hipStreamSynchronize(s)); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); @@ -79,7 +97,7 @@ int main() } passed(); - + } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 316f50c01b..9a09e7e95c 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -63,7 +63,16 @@ void simpleTest1() HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); diff --git a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 9b2e749cf9..9d274543ab 100644 --- a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) // This is the null stream? //hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); - hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel( + HipTest::vectorADDReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); diff --git a/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp b/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp index 8e46a69273..7a4cd978a1 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -128,7 +128,17 @@ void Streamer::enqueAsync() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + static_cast(_B_d), + _C_d, + _numElements, + p_repeat); } @@ -225,7 +235,17 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); if (p_db) { @@ -257,7 +277,17 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); nullStreamer->D2H(); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp index 4c49d80c05..962737774d 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -97,7 +97,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0 /*stream*/, + static_cast(C_d), + C_h, + numElements, + count); HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index cf463be76a..a7a930b4f6 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -163,9 +163,27 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_commandType == COMMAND_ADD_REVERSE) { - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + static_cast(_numElements), + static_cast(p_count)); } else if (_commandType == COMMAND_ADD_FORWARD) { - hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCount, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + _numElements, + static_cast(p_count)); } else if (_commandType == COMMAND_COPY) { HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } else {