diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp new file mode 100644 index 0000000000..080132c561 --- /dev/null +++ b/include/hip/hcc_detail/code_object_bundle.hpp @@ -0,0 +1,134 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl +{ + hsa_isa_t triple_to_hsa_isa(const std::string& triple); + + struct Bundled_code { + union { + struct { + std::uint64_t offset; + std::uint64_t bundle_sz; + std::uint64_t triple_sz; + }; + std::uint8_t cbuf[ + sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; + }; + std::string triple; + std::vector blob; + }; + + class Bundled_code_header { + // DATA - STATICS + static constexpr const char magic_string_[] = + "__CLANG_OFFLOAD_BUNDLE__"; + static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1; + + // DATA + union { + struct { + std::uint8_t bundler_magic_string_[magic_string_sz_]; + std::uint64_t bundle_cnt_; + }; + std::uint8_t cbuf_[ + sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)]; + }; + std::vector bundles_; + + // FRIENDS - MANIPULATORS + template + friend + inline + bool read( + RandomAccessIterator f, + RandomAccessIterator l, + Bundled_code_header& x) + { + std::copy_n(f, sizeof(x.cbuf_), x.cbuf_); + + if (valid(x)) { + x.bundles_.resize(x.bundle_cnt_); + + auto it = f + sizeof(x.cbuf_); + for (auto&& y : x.bundles_) { + std::copy_n(it, sizeof(y.cbuf), y.cbuf); + it += sizeof(y.cbuf); + + y.triple.insert(y.triple.cend(), it, it + y.triple_sz); + + std::copy_n( + f + y.offset, y.bundle_sz, std::back_inserter(y.blob)); + + it += y.triple_sz; + } + + return true; + } + + return false; + } + friend + inline + bool read(const std::vector& blob, Bundled_code_header& x) + { + return read(blob.cbegin(), blob.cend(), x); + } + friend + inline + bool read(std::istream& is, Bundled_code_header& x) + { + return read(std::vector{ + std::istreambuf_iterator{is}, + std::istreambuf_iterator{}}, + x); + } + + // FRIENDS - ACCESSORS + friend + inline + bool valid(const Bundled_code_header& x) + { + return std::equal( + x.bundler_magic_string_, + x.bundler_magic_string_ + magic_string_sz_, + x.magic_string_); + } + friend + inline + const std::vector& bundles(const Bundled_code_header& x) + { + return x.bundles_; + } + public: + // CREATORS + Bundled_code_header() = default; + template + Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l); + explicit + Bundled_code_header(const std::vector& blob); + Bundled_code_header(const Bundled_code_header&) = default; + Bundled_code_header(Bundled_code_header&&) = default; + ~Bundled_code_header() = default; + + // MANIPULATORS + Bundled_code_header& operator=(const Bundled_code_header&) = default; + Bundled_code_header& operator=(Bundled_code_header&&) = default; + }; + + // CREATORS + template + Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{} + { + read(f, l, *this); + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 4c632f9d68..e3fa3331ac 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -24,984 +24,139 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 1 +#include "code_object_bundle.hpp" #include "concepts.hpp" #include "helpers.hpp" +#include "program_state.hpp" #include "hc.hpp" #include "hip/hip_hcc.h" #include "hip_runtime.h" +#include +#include #include #include +#include #include +#include +#include #include +#include #include +#include namespace hip_impl { - namespace + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + T round_up_to_next_multiple_nonnegative(T x, T y) { - struct New_grid_launch_tag {}; - struct Old_grid_launch_tag {}; - - template - class RAII_guard { - D dtor_; - public: - RAII_guard() = default; - - RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} - { - ctor(); - } - - RAII_guard(const RAII_guard&) = default; - RAII_guard(RAII_guard&&) = default; - - RAII_guard& operator=(const RAII_guard&) = default; - RAII_guard& operator=(RAII_guard&&) = default; - - ~RAII_guard() { dtor_(); } - }; - - template - RAII_guard make_RAII_guard(const C& ctor, D dtor) - { - return RAII_guard{ctor, std::move(dtor)}; - } - - template - using is_new_grid_launch_t = typename std::conditional< - is_callable{}, - New_grid_launch_tag, - Old_grid_launch_tag>::type; + T tmp = x + y - 1; + return tmp - tmp % y; } - // TODO: - dispatch rank should be derived from the domain dimensions passed - // in, and not always assumed to be 3; - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - const hc::accelerator_view& acc_v, - K k) + std::vector make_kernarg() { - const auto d = hc::extent<3>{ - num_blocks.z * dim_blocks.z, - num_blocks.y * dim_blocks.y, - num_blocks.x * dim_blocks.x}.tile_with_dynamic( - dim_blocks.z, - dim_blocks.y, - dim_blocks.x, - group_mem_bytes); - - try { - hc::parallel_for_each(acc_v, d, k); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } + return {}; } - // TODO: these are workarounds, they should be removed. - - hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); - void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); - void unlock_stream_hip_( - hipStream_t, void*, const char*, hc::accelerator_view*); - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, + std::vector make_kernarg(std::vector kernarg) + { + return kernarg; + } + + template + inline + std::vector make_kernarg(std::vector kernarg, T x) + { + kernarg.resize( + round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + + sizeof(T)); + + new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; + + return kernarg; + } + + template + inline + std::vector make_kernarg( + std::vector kernarg, T x, Ts... xs) + { + return make_kernarg( + make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); + } + + template + inline + std::vector make_kernarg(Ts... xs) + { + std::vector kernarg; + kernarg.reserve(sizeof(std::tuple)); + + return make_kernarg(std::move(kernarg), std::move(xs)...); + } + + void hipLaunchKernelGGLImpl( + std::uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, hipStream_t stream, - const char* kernel_name, - K k) - { - void* lck_stream = nullptr; - auto acc_v = lock_stream_hip_(stream, lck_stream); - auto stream_guard = make_RAII_guard( - std::bind( - print_prelaunch_trace_, - kernel_name, - num_blocks, - dim_blocks, - group_mem_bytes, - stream), - std::bind( - unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); + void** kernarg); +} // Namespace hip_impl. - try { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - acc_v, - std::move(k)); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } - } +template +inline +void hipLaunchKernelGGL( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + Args... args) +{ + auto kernarg = hip_impl::make_kernarg(std::move(args)...); + std::size_t kernarg_size = kernarg.size(); - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } + void* config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),//&kernarg, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, + HIP_LAUNCH_PARAM_END + }; - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } - - // TODO: these are temporary and purposefully noisy and disruptive. - #define make_kernel_name_hip(k, n)\ - HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ - HIP_kernel_functor_name_end ## _ ## n - - #define make_kernel_functor_hip_30(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26, p27)\ - struct make_kernel_name_hip(function_name, 28) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - std::decay_t _p27_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_, _p27_);\ - }\ - } - #define make_kernel_functor_hip_29(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26)\ - struct make_kernel_name_hip(function_name, 27) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_);\ - }\ - } - #define make_kernel_functor_hip_28(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25)\ - struct make_kernel_name_hip(function_name, 26) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ - }\ - } - #define make_kernel_functor_hip_27(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24)\ - struct make_kernel_name_hip(function_name, 25) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - }\ - } - #define make_kernel_functor_hip_26(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ - struct make_kernel_name_hip(function_name, 24) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - }\ - } - #define make_kernel_functor_hip_25(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ - struct make_kernel_name_hip(function_name, 23) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - __attribute__((used, flatten))\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - }\ - } - #define make_kernel_functor_hip_24(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ - struct make_kernel_name_hip(function_name, 22) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - }\ - } - #define make_kernel_functor_hip_23(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ - struct make_kernel_name_hip(function_name, 21) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - }\ - } - #define make_kernel_functor_hip_22(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ - struct make_kernel_name_hip(function_name, 20) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - }\ - } - #define make_kernel_functor_hip_21(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18)\ - struct make_kernel_name_hip(function_name, 19) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - }\ - } - #define make_kernel_functor_hip_20(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17)\ - struct make_kernel_name_hip(function_name, 18) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - }\ - } - #define make_kernel_functor_hip_19(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16)\ - struct make_kernel_name_hip(function_name, 17) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - }\ - } - #define make_kernel_functor_hip_18(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15)\ - struct make_kernel_name_hip(function_name, 16) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - }\ - } - #define make_kernel_functor_hip_17(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14)\ - struct make_kernel_name_hip(function_name, 15) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_);\ - }\ - } - #define make_kernel_functor_hip_16(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13)\ - struct make_kernel_name_hip(function_name, 14) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_);\ - }\ - } - #define make_kernel_functor_hip_15(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12)\ - struct make_kernel_name_hip(function_name, 13) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_);\ - }\ - } - #define make_kernel_functor_hip_14(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11)\ - struct make_kernel_name_hip(function_name, 12) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_);\ - }\ - } - #define make_kernel_functor_hip_13(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - struct make_kernel_name_hip(function_name, 11) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - }\ - } - #define make_kernel_functor_hip_12(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - struct make_kernel_name_hip(function_name, 10) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_);\ - }\ - } - #define make_kernel_functor_hip_11(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - struct make_kernel_name_hip(function_name, 9) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ - }\ - } - #define make_kernel_functor_hip_10(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - struct make_kernel_name_hip(function_name, 8) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - }\ - } - #define make_kernel_functor_hip_9(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - struct make_kernel_name_hip(function_name, 7) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - }\ - } - #define make_kernel_functor_hip_8(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ - struct make_kernel_name_hip(function_name, 6) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - }\ - } - #define make_kernel_functor_hip_7(\ - function_name, kernel_name, p0, p1, p2, p3, p4)\ - struct make_kernel_name_hip(function_name, 5) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - }\ - } - #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ - struct make_kernel_name_hip(function_name, 4) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - }\ - } - #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ - struct make_kernel_name_hip(function_name, 3) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_);\ - }\ - } - #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ - struct make_kernel_name_hip(function_name, 2) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_);\ - }\ - } - #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n - #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ - struct make_kernel_name_hip(function_name, 1) {\ - std::decay_t _p0_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_);\ - }\ - } - #define make_kernel_functor_hip_2(function_name, kernel_name)\ - struct make_kernel_name_hip(function_name, 0) {\ - void operator()(const hc::tiled_index<3>&) [[hc]]\ - {\ - return kernel_name(hipLaunchParm{});\ - }\ - } - #define make_kernel_functor_hip_1(...) - #define make_kernel_functor_hip_0(...) - #define make_kernel_functor_hip_(...)\ - overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - - - #define hipLaunchNamedKernelGGL(\ - function_name,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ - hip_kernel_functor_impl_{__VA_ARGS__};\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - hip_kernel_functor_impl_);\ - } while(0) - - #define hipLaunchKernelGGL(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchNamedKernelGGL(\ - unnamed,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ##__VA_ARGS__);\ - } while (0) - - #define hipLaunchKernel(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) + hip_impl::hipLaunchKernelGGLImpl( + reinterpret_cast(kernel), + numBlocks, + dimBlocks, + sharedMemBytes, + stream, + &config[0]); } + +template +inline +void hipLaunchKernel( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t groupMemBytes, + hipStream_t stream, + Args... args) +{ + hipLaunchKernelGGL( + kernel, + numBlocks, + dimBlocks, + groupMemBytes, + stream, + hipLaunchParm{}, + std::move(args)...); +} + #endif //GENERIC_GRID_LAUNCH diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 370ac2abbb..d3211ed3f5 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -53,7 +53,7 @@ THE SOFTWARE. // define HIP_ENABLE_PRINTF to enable printf #ifdef HIP_ENABLE_PRINTF #define HCC_ENABLE_ACCELERATOR_PRINTF 1 -#endif +#endif //--- // Remainder of this file only compiles with HCC @@ -481,7 +481,7 @@ do {\ type* var = \ (type*)__get_dynamicgroupbaseptr(); \ -#define HIP_DYNAMIC_SHARED_ATTRIBUTE +#define HIP_DYNAMIC_SHARED_ATTRIBUTE diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index b2e7ac2617..56cfa0cc0f 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -44,7 +44,8 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak)) +#define __global__ \ + __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp new file mode 100644 index 0000000000..03701725eb --- /dev/null +++ b/include/hip/hcc_detail/program_state.hpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +struct ihipModuleSymbol_t; +using hipFunction_t = ihipModuleSymbol_t*; + +namespace hip_impl +{ + struct Kernel_descriptor { + std::uint64_t kernel_object_; + std::uint32_t group_size_; + std::uint32_t private_size_; + std::string name_; + + operator hipFunction_t() const + { // TODO: this is awful and only meant for illustration. + return reinterpret_cast( + const_cast(this)); + } + }; + + const std::unordered_map< + std::uintptr_t, + std::vector>>& functions(); + const std::unordered_map& function_names(); + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, std::istream& file); +} // Namespace hip_impl. \ No newline at end of file diff --git a/src/code_object_bundle.cpp b/src/code_object_bundle.cpp new file mode 100644 index 0000000000..d7d2cd1e10 --- /dev/null +++ b/src/code_object_bundle.cpp @@ -0,0 +1,39 @@ +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include + +#include +#include +#include + +hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple) +{ + static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx"; + static constexpr std::size_t prefix_sz = sizeof(prefix) - 1; + + hsa_isa_t r = {}; + + auto idx = triple.find(prefix); + + if (idx != std::string::npos) { + idx += prefix_sz; + std::string tmp = "AMD:AMDGPU"; + while (idx != triple.size()) { + tmp.push_back(':'); + tmp.push_back(triple[idx++]); + } + + hsa_isa_from_name(tmp.c_str(), &r); + } + + return r; +} + +// DATA - STATICS +constexpr const char hip_impl::Bundled_code_header::magic_string_[]; + +// CREATORS +hip_impl::Bundled_code_header::Bundled_code_header( + const std::vector& x) + : Bundled_code_header{x.cbegin(), x.cend()} +{} \ No newline at end of file diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index fd5c2a1573..4a26f66c8c 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -21,76 +21,118 @@ THE SOFTWARE. */ #include "hip/hcc_detail/grid_launch_GGL.hpp" +#include "hip/hcc_detail/program_state.hpp" + +#include "hip/hip_runtime_api.h" // Internal header, do not percolate upwards. #include "hip_hcc_internal.h" #include "hc.hpp" #include "trace_helper.h" +#include +#include +#include +#include + #include -#include + +using namespace hc; +using namespace std; namespace hip_impl { - hc::accelerator_view lock_stream_hip_( - hipStream_t& stream, void*& locked_stream) - { // This allocated but does not take ownership of locked_stream. If it is - // not deleted elsewhere it will leak. - using L = decltype(stream->lockopen_preKernelCommand()); - - HIP_INIT(); - - stream = ihipSyncAndResolveStream(stream); - locked_stream = new L{stream->lockopen_preKernelCommand()}; - return (*static_cast(locked_stream))->_av; - } - - void print_prelaunch_trace_( - const char* kernel_name, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream) + namespace { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || - HIP_PROFILE_API || - (COMPILE_HIP_DB && (HIP_TRACE_API & (1<second; + } + + inline + string name(hsa_agent_t agent) + { + char n[64] = {}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return string{n}; + } + + inline + hsa_agent_t target_agent(hipStream_t stream) + { + if (stream) { + return *static_cast( + stream->locked_getAv()->get_hsa_agent()); + } + else if ( + ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + return ihipGetDevice( + ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; + } + else { + return *static_cast( + accelerator{}.get_default_view().get_hsa_agent()); } } } - void unlock_stream_hip_( + void hipLaunchKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, hipStream_t stream, - void* locked_stream, - const char* kernel_name, - hc::accelerator_view* acc_v) - { // Precondition: acc_v is the accelerator_view associated with stream - // which is guarded by locked_stream; - // locked_stream is deletable. - using L = decltype(stream->lockopen_preKernelCommand()); + void** kernarg) + { + const auto it0 = functions().find(function_address); - stream->lockclose_postKernelCommand(kernel_name, acc_v); + if (it0 == functions().cend()) { + throw runtime_error{ + "No device code available for function: " + + name(function_address) + }; + } - delete static_cast(locked_stream); - locked_stream = nullptr; + auto agent = target_agent(stream); + + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first.handle == agent.handle; + }); + + if (it1 == it0->second.cend()) { + throw runtime_error{ + "No code available for function: " + name(function_address) + + ", for agent: " + name(agent) + }; + } + + for (auto&& agent_kernel : it0->second) { + if (agent.handle == agent_kernel.first.handle) { + hipModuleLaunchKernel( + agent_kernel.second, + numBlocks.x, + numBlocks.y, + numBlocks.z, + dimBlocks.x, + dimBlocks.y, + dimBlocks.z, + sharedMemBytes, + stream, + nullptr, + kernarg); + } + } } } diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 197cd35bfa..503bebcd6a 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -233,7 +233,7 @@ static const DbName dbName [] = #if COMPILE_HIP_DB #define tprintf(trace_level, ...) {\ if (HIP_DB & (1<<(trace_level))) {\ - char msgStr[1000];\ + char msgStr[2000];\ snprintf(msgStr, 2000, __VA_ARGS__);\ fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \ }\ diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index a8324c5729..96fc25c27d 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -65,7 +65,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) if (shareWithAll) { hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); - tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); if (s != HSA_STATUS_SUCCESS) { ret = -1; } @@ -122,7 +122,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool if (HIP_INIT_ALLOC != -1) { // TODO , dont' call HIP API directly here: hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes); - } + } if (ptr != nullptr) { int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); @@ -251,7 +251,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hip_status = hipErrorMemoryAllocation; } - } + } return ihipLogStatus(hip_status); @@ -284,10 +284,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } - const unsigned supportedFlags = hipHostMallocPortable - | hipHostMallocMapped - | hipHostMallocWriteCombined - | hipHostMallocCoherent + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent | hipHostMallocNonCoherent; @@ -300,7 +300,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) hip_status = hipErrorInvalidValue; } else { auto device = ctx->getWriteableDevice(); - + unsigned amFlags = 0; if (flags & hipHostMallocCoherent) { amFlags = amHostCoherent; @@ -581,7 +581,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array, hsa_ext_image_data_info_t imageInfo; hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo); size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment; - + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment); if (size && (*ptr == NULL)) { @@ -1585,7 +1585,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ HIP_INIT_API ( handle, devPtr); hipError_t hipStatus = hipSuccess; // Get the size of allocated pointer - size_t psize; + size_t psize = 0u; hc::accelerator acc; if((handle == NULL) || (devPtr == NULL)) { hipStatus = hipErrorInvalidResourceHandle; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index e9e572af9b..a77ee48a53 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -119,15 +119,18 @@ namespace hipdrv { uint64_t PrintSymbolSizes(const void *emi, const char *name){ using namespace ELFIO; - const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi; + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; if(NULL == ehdr || EV_CURRENT != ehdr->e_version){} - const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff); + const ELFIO::Elf64_Shdr * shdr = + (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff); for(uint16_t i=0;ie_shnum;++i){ if(shdr[i].sh_type == SHT_SYMTAB){ - const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset); + const ELFIO::Elf64_Sym *syms = + (const ELFIO::Elf64_Sym*)((char*)emi + shdr[i].sh_offset); assert(syms); uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize; - const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); + const char* strtab = + (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); assert(strtab); for(uint64_t i=0;ie_shoff); + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; + const ELFIO::Elf64_Shdr *shdr = (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff); uint64_t max_offset = ehdr->e_shoff; uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; @@ -164,156 +167,8 @@ uint64_t ElfSize(const void *emi){ return total_size; } -namespace -{ - template - inline - ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) - { - using namespace std; - - const auto it = find_if( - reader.sections.begin(), reader.sections.end(), move(p)); - - return it != reader.sections.end() ? *it : nullptr; - } - - inline - std::vector copy_names_of_undefined_symbols( - const ELFIO::symbol_section_accessor& section) - { - using namespace ELFIO; - using namespace std; - - vector r; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (sect_idx == SHN_UNDEF && !name.empty()) { - r.push_back(std::move(name)); - } - } - - return r; - } - - inline - std::pair find_symbol_address( - const ELFIO::symbol_section_accessor& section, - const std::string& symbol_name) - { - using namespace ELFIO; - using namespace std; - - static constexpr pair r{0, 0}; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (name == symbol_name) return make_pair(value, size); - } - - return r; - } - - inline - void associate_code_object_symbols_with_host_allocation( - const ELFIO::elfio& reader, - const ELFIO::elfio& self_reader, - ELFIO::section* code_object_dynsym, - ELFIO::section* process_symtab, - hsa_agent_t agent, - hsa_executable_t executable) - { - using namespace ELFIO; - using namespace std; - - if (!code_object_dynsym || !process_symtab) return; - - const auto undefined_symbols = copy_names_of_undefined_symbols( - symbol_section_accessor{reader, code_object_dynsym}); - - for (auto&& x : undefined_symbols) { - const auto tmp = find_symbol_address( - symbol_section_accessor{self_reader, process_symtab}, x); - - assert(tmp.first); - - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - - static vector< - unique_ptr> globals; - static mutex mtx; - - lock_guard lck{mtx}; - globals.emplace_back(p, hsa_amd_memory_unlock); - } - } - - inline - void load_code_object_and_freeze_executable( - const char* file, hsa_agent_t agent, hsa_executable_t executable) - { // TODO: the following sequence is inefficient, should be refactored - // into a single load of the file and subsequent ELFIO - // processing. - using namespace std; - - static const auto cor_deleter = [](hsa_code_object_reader_t* p) { - hsa_code_object_reader_destroy(*p); - }; - - using RAII_code_reader = unique_ptr< - hsa_code_object_reader_t, decltype(cor_deleter)>; - - unique_ptr cobj{fopen(file, "r"), fclose}; - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get()); - - hsa_executable_load_agent_code_object( - executable, agent, *tmp, nullptr, nullptr); - - hsa_executable_freeze(executable, nullptr); - - static vector code_readers; - static mutex mtx; - - lock_guard lck{mtx}; - code_readers.push_back(move(tmp)); - } -} - hipError_t hipModuleLoad(hipModule_t *module, const char *fname) { - using namespace ELFIO; - HIP_INIT_API(module, fname); hipError_t ret = hipSuccess; *module = new ihipModule_t; @@ -336,36 +191,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname) nullptr, &(*module)->executable); - elfio reader; - if (!reader.load(fname)) { + std::ifstream file{fname}; + + if (!file.is_open()) { return ihipLogStatus(hipErrorFileNotFound); } - else { - // TODO: this may benefit from caching as well. - elfio self_reader; - self_reader.load("/proc/self/exe"); - - const auto symtab = - find_section_if(self_reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - const auto code_object_dynsym = - find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_DYNSYM; - }); - - associate_code_object_symbols_with_host_allocation( - reader, - self_reader, - code_object_dynsym, - symtab, - currentDevice->_hsaAgent, - (*module)->executable); - - load_code_object_and_freeze_executable( - fname, currentDevice->_hsaAgent, (*module)->executable); - } + (*module)->executable = hip_impl::load_executable( + (*module)->executable, currentDevice->_hsaAgent, file); + ret = (*module)->executable.handle ? hipSuccess : hipErrorUnknown; } return ihipLogStatus(ret); diff --git a/src/program_state.cpp b/src/program_state.cpp new file mode 100644 index 0000000000..be871a6e84 --- /dev/null +++ b/src/program_state.cpp @@ -0,0 +1,498 @@ +#include "../include/hip/hcc_detail/program_state.hpp" + +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include "hip_hcc_internal.h" +#include "trace_helper.h" + +#include "elfio/elfio.hpp" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ELFIO; +using namespace hip_impl; +using namespace std; + +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; + + template<> + struct hash { + size_t operator()(hsa_isa_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + +inline +constexpr +bool operator==(hsa_isa_t x, hsa_isa_t y) +{ + return x.handle == y.handle; +} + +namespace +{ + vector copy_names_of_undefined_symbols( + const symbol_section_accessor& section) + { + vector r; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (sect_idx == SHN_UNDEF && !name.empty()) { + r.push_back(std::move(name)); + } + } + + return r; + } + + pair find_symbol_address( + const symbol_section_accessor& section, + const string& symbol_name) + { + static constexpr pair r{0, 0}; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (name == symbol_name) return make_pair(value, size); + } + + return r; + } + + void associate_code_object_symbols_with_host_allocation( + const elfio& reader, + const elfio& self_reader, + section* code_object_dynsym, + section* process_symtab, + hsa_agent_t agent, + hsa_executable_t executable) + { + if (!code_object_dynsym || !process_symtab) return; + + const auto undefined_symbols = copy_names_of_undefined_symbols( + symbol_section_accessor{reader, code_object_dynsym}); + + for (auto&& x : undefined_symbols) { + const auto tmp = find_symbol_address( + symbol_section_accessor{self_reader, process_symtab}, x); + + assert(tmp.first); + + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); + + hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + + static vector< + unique_ptr> globals; + static mutex mtx; + + lock_guard lck{mtx}; + globals.emplace_back(p, hsa_amd_memory_unlock); + } + } + + template + inline + section* find_section_if(elfio& reader, P p) + { + const auto it = find_if( + reader.sections.begin(), reader.sections.end(), std::move(p)); + + return it != reader.sections.end() ? *it : nullptr; + } + + vector code_object_blob_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + static constexpr const char kernel_section[] = ".kernel"; + + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{"Failed to load ELF file for current process."}; + } + + auto kernels = find_section_if(reader, [](const section* x) { + return x->get_name() == kernel_section; + }); + + vector r; + if (kernels) { + r.insert( + r.end(), + kernels->get_data(), + kernels->get_data() + kernels->get_size()); + } + + return r; + } + + const unordered_map>>& code_object_blobs() + { + static unordered_map>> r; + static once_flag f; + + call_once(f, []() { + static vector> blobs{ + code_object_blob_for_process()}; + + dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) { + elfio tmp; + if (tmp.load(i->dlpi_name)) { + const auto it = find_section_if(tmp, [](const section* x) { + return x->get_name() == ".kernel"; + }); + + if (it) blobs.emplace_back( + it->get_data(), it->get_data() + it->get_size()); + } + return 0; + }, nullptr); + + for (auto&& blob : blobs) { + Bundled_code_header tmp{blob}; + if (valid(tmp)) { + for (auto&& bundle : bundles(tmp)) { + r[triple_to_hsa_isa(bundle.triple)] + .push_back(bundle.blob); + } + } + } + }); + + return r; + } + + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + cout << r.size() << endl; + return r; + } + + inline + hsa_agent_t agent(hsa_executable_symbol_t x) + { + hsa_agent_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); + + return r; + } + + inline + uint32_t group_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); + + return r; + } + + inline + uint64_t kernel_object(hsa_executable_symbol_t x) + { + uint64_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); + + return r; + } + + inline + string name(hsa_executable_symbol_t x) + { + uint32_t sz = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + string r(sz, '\0'); + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; + } + + inline + uint32_t private_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); + + return r; + } + + inline + hsa_symbol_kind_t type(hsa_executable_symbol_t x) + { + hsa_symbol_kind_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); + + return r; + } + + const unordered_map>& kernels() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto copy_kernels = []( + hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) { + if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s); + + return HSA_STATUS_SUCCESS; + }; + + for (auto&& agent_executables : executables()) { + for (auto&& executable : agent_executables.second) { + hsa_executable_iterate_agent_symbols( + executable, + agent_executables.first, + copy_kernels, + nullptr); + } + } + }); + + return r; + } + + void load_code_object_and_freeze_executable( + istream& file, hsa_agent_t agent, hsa_executable_t executable) + { // TODO: the following sequence is inefficient, should be refactored + // into a single load of the file and subsequent ELFIO + // processing. + static const auto cor_deleter = [](hsa_code_object_reader_t* p) { + hsa_code_object_reader_destroy(*p); + }; + + using RAII_code_reader = unique_ptr< + hsa_code_object_reader_t, decltype(cor_deleter)>; + + file.seekg(0); + + vector blob{ + istreambuf_iterator{file}, istreambuf_iterator{}}; + RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; + hsa_code_object_reader_create_from_memory( + blob.data(), blob.size(), tmp.get()); + + hsa_executable_load_agent_code_object( + executable, agent, *tmp, nullptr, nullptr); + + hsa_executable_freeze(executable, nullptr); + + static vector code_readers; + static mutex mtx; + + lock_guard lck{mtx}; + code_readers.push_back(move(tmp)); + } +} + +namespace hip_impl +{ + const unordered_map& function_names() + { + static constexpr const char self[] = "/proc/self/exe"; + + static unordered_map r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + symbol_section_accessor symbols{reader, symtab}; + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + symbols.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { + r.emplace(value, name); + } + } + }); + + return r; + } + + const unordered_map< + uintptr_t, vector>>& functions() + { + static unordered_map< + uintptr_t, vector>> r; + static once_flag f; + + call_once(f, []() { + for (auto&& function : function_names()) { + const auto it = kernels().find(function.second); + + if (it != kernels().cend()) { + for (auto&& kernel_symbol : it->second) { + r[function.first].emplace_back( + agent(kernel_symbol), + Kernel_descriptor{ + kernel_object(kernel_symbol), + group_size(kernel_symbol), + private_size(kernel_symbol), + it->first}); + } + } + } + }); + + return r; + } + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, istream& file) + { + elfio reader; + if (!reader.load(file)) { + return hsa_executable_t{}; + } + else { + // TODO: this may benefit from caching as well. + elfio self_reader; + self_reader.load("/proc/self/exe"); + + const auto symtab = + find_section_if(self_reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + const auto code_object_dynsym = + find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_DYNSYM; + }); + + associate_code_object_symbols_with_host_allocation( + reader, self_reader, code_object_dynsym, symtab, agent, executable); + + load_code_object_and_freeze_executable(file, agent, executable); + + return executable; + } + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/tests/src/context/hipMemsetD8.cpp b/tests/src/context/hipMemsetD8.cpp index 3730fcb70b..a356d05b76 100644 --- a/tests/src/context/hipMemsetD8.cpp +++ b/tests/src/context/hipMemsetD8.cpp @@ -46,7 +46,6 @@ int main(int argc, char *argv[]) A_h = new char[Nbytes]; HIPCHECK ( hipMalloc((void **) &A_d, Nbytes) ); - A_h = (char*)malloc(Nbytes); printf ("Size=%zu memsetval=%2x \n", Nbytes, memsetval); HIPCHECK ( hipMemsetD8(A_d, memsetval, Nbytes) ); @@ -61,7 +60,7 @@ int main(int argc, char *argv[]) } hipFree((void *) A_d); - free(A_h); + delete [] A_h; passed(); } diff --git a/tests/src/deviceLib/hipTestDevice.cpp b/tests/src/deviceLib/hipTestDevice.cpp index 570f3baaf0..fa85940839 100644 --- a/tests/src/deviceLib/hipTestDevice.cpp +++ b/tests/src/deviceLib/hipTestDevice.cpp @@ -139,7 +139,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -174,7 +181,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -205,7 +219,13 @@ for(int i=0;i<512;i++){ } } -free(A); +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -234,7 +254,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -263,7 +288,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -291,7 +321,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -321,7 +356,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -350,7 +390,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -387,7 +432,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -427,7 +481,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -457,7 +522,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -489,7 +559,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -525,7 +602,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -565,7 +651,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -595,7 +692,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -622,7 +724,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -631,7 +738,7 @@ return false; } int main(){ -if(run_sincosf() && run_sincospif() && run_fdividef() && +if(run_sincosf() && run_sincospif() && run_fdividef() && run_llrintf() && run_norm3df() && run_norm4df() && run_normf() && run_rnorm3df() && run_rnorm4df() && run_rnormf() && run_lroundf() && run_llroundf() && diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp index 5bdbbf1b8f..3b919d0cab 100644 --- a/tests/src/deviceLib/hipTestDeviceDouble.cpp +++ b/tests/src/deviceLib/hipTestDeviceDouble.cpp @@ -128,7 +128,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -163,7 +170,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -193,7 +207,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -221,7 +240,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -249,7 +273,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -278,7 +307,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -306,7 +340,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -343,7 +382,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -383,7 +431,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -416,7 +475,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -452,7 +518,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -492,7 +567,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -522,7 +608,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -549,7 +640,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index 171ff1afd0..5540c4917d 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -159,11 +159,16 @@ bool dataTypesRun(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -221,11 +226,16 @@ bool dataTypesRun2(){ HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -281,11 +291,16 @@ bool dataTypesRun4(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp index 79a272aaf2..d1bbed63cd 100644 --- a/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/tests/src/experimental/xcompile/hipxxKer.cpp @@ -36,17 +36,23 @@ __global__ void Kern(hipLaunchParm lp, float *A) int main() { - float *A, *Ad; + float A[len]; + float *Ad; + for(int i=0;i(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK (hipEventRecord(stop, NULL)); diff --git a/tests/src/runtimeApi/event/record_event.cpp b/tests/src/runtimeApi/event/record_event.cpp index bd8a3ada8e..a7b99749cb 100644 --- a/tests/src/runtimeApi/event/record_event.cpp +++ b/tests/src/runtimeApi/event/record_event.cpp @@ -52,7 +52,7 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", + printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", testMask, stream, waitStart, syncModeString(syncMode)); size_t sizeBytes = numElements * sizeof(int); @@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + stream, + static_cast(C_d), + C_h, + numElements, + count); HIPCHECK(hipEventRecord(stop, stream)); @@ -85,8 +94,8 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventSynchronize(start)); } - - hipError_t expectedStopError = hipSuccess; + + hipError_t expectedStopError = hipSuccess; // How to wait for the events to finish: switch (syncMode) { @@ -97,12 +106,12 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... break; case syncStopEvent: - HIPCHECK(hipEventSynchronize(stop)); + HIPCHECK(hipEventSynchronize(stop)); break; default: assert(0); }; - + float t; @@ -111,25 +120,25 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e)); } - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); - + // stop usually ready unless we skipped the synchronization (syncNone) HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError); - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); e = hipEventElapsedTime(&t, start, stop); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t>0.0f); printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); e = hipEventElapsedTime(&t, stop, start); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t<0.0f); printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index e8e803e44c..b3f25658fc 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -58,7 +58,7 @@ public: void offset(int offset) { _offset = offset; }; int offset() const { return _offset; }; - + private: T * _A_d; T* _B_d; @@ -72,7 +72,7 @@ private: template DeviceMemory::DeviceMemory(size_t numElements) - : _maxNumElements(numElements), + : _maxNumElements(numElements), _offset(0) { T ** np = nullptr; @@ -93,7 +93,7 @@ DeviceMemory::~DeviceMemory () HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); HIPCHECK (hipFree(_C_dd)); - + _C_dd = NULL; }; @@ -125,7 +125,7 @@ public: T * A_hh; T* B_hh; - bool _usePinnedHost; + bool _usePinnedHost; private: size_t _maxNumElements; @@ -165,11 +165,11 @@ HostMemory::HostMemory(size_t numElements, bool usePinnedHost) template void -HostMemory::reset(size_t numElements, bool full) +HostMemory::reset(size_t numElements, bool full) { // Initialize the host data: for (size_t i=0; i void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", - __func__, + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", + __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, @@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(dmem->A_d()), + static_cast(dmem->B_d()), + dmem->C_d(), + numElements); if (useDeviceToDevice) { // Do an extra device-to-device copy here to mix things up: @@ -273,8 +282,8 @@ void memcpytest2_for_type(size_t numElements) { printSep(); - DeviceMemory memD(numElements); - HostMemory memU(numElements, 0/*usePinnedHost*/); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); HostMemory memP(numElements, 1/*usePinnedHost*/); for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { @@ -307,11 +316,11 @@ void memcpytest2_sizes(size_t maxElem=0) maxElem = free/sizeof(T)/8; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); for (size_t elem=1; elem<=maxElem; elem*=2) { @@ -336,11 +345,11 @@ void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) HIPCHECK(hipMemGetInfo(&free, &total)); - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); size_t elem = maxElem / 2; @@ -380,16 +389,16 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - DeviceMemory memD(N); - HostMemory mem1(N, usePinnedHost); - HostMemory mem2(N, usePinnedHost); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); @@ -427,21 +436,21 @@ int main(int argc, char *argv[]) // Some tests around the 64KB boundary which have historically shown issues: printf ("\n\n=== tests&0x2 (64KB boundary)\n"); size_t maxElem = 32*1024*1024; - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); - HostMemory memP(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); // Just over 64MB: - memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); } @@ -464,7 +473,7 @@ int main(int argc, char *argv[]) // Simplest cases: serialize the threads, and also used pinned memory: // This verifies that the sub-calls to memcpytest2 are correct. - multiThread_1(true, true); + multiThread_1(true, true); // Serialize, but use unpinned memory to stress the unpinned memory xfer path. multiThread_1(true, false); diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 316f50c01b..9a09e7e95c 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -63,7 +63,16 @@ void simpleTest1() HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 4f73b67ad7..9d274543ab 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -41,8 +41,8 @@ void printSep() // Designed to stress a small number of simple smoke tests template< - typename T=float, - class P=HipTest::Unpinned, + typename T=float, + class P=HipTest::Unpinned, class C=HipTest::Memcpy > void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) @@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) // This is the null stream? //hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); - hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel( + HipTest::vectorADDReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp index b610315608..04a232f3bb 100644 --- a/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -119,7 +119,7 @@ void Streamer::reset() { HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); H2D(); - + } @@ -128,7 +128,17 @@ void Streamer::enqueAsync() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + static_cast(_B_d), + _C_d, + _numElements, + p_repeat); } @@ -225,7 +235,17 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); if (p_db) { @@ -238,7 +258,7 @@ int main(int argc, char *argv[]) nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -257,13 +277,23 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -289,10 +319,10 @@ int main(int argc, char *argv[]) // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); - + HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); } diff --git a/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/tests/src/runtimeApi/stream/hipStreamSync2.cpp index c6a58ce7d4..962737774d 100644 --- a/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -59,23 +59,23 @@ const char *syncModeString(int syncMode) { void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - // This test sends a long-running kernel to the null stream, then tests to see if the + // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // - // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it - // may complete before the check. To prevent this, the addCountReverse has a count parameter - // which causes it to loop repeatedly, and the results are checked in reverse order. + // may complete before the check. To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. - + if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -97,8 +97,17 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); - HIPCHECK(hipEventRecord(stop, 0/*default*/)); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0 /*stream*/, + static_cast(C_d), + C_h, + numElements, + count); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -108,18 +117,18 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s break; case syncOtherStream: // Does this synchronize with the null stream? - HIPCHECK(hipStreamSynchronize(otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - - // this may wait for NULL stream depending hipStreamNonBlocking flag above - HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); - HIPCHECK(hipStreamSynchronize(otherStream)); + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: - HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); @@ -197,7 +206,7 @@ void runTests(int64_t numElements) int main(int argc, char *argv[]) { // Can' destroy the default stream:// TODO - move to another test - HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index f5b1b79550..a7a930b4f6 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -88,7 +88,7 @@ private: template Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : - _preA_d(NULL), + _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), @@ -163,9 +163,27 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_commandType == COMMAND_ADD_REVERSE) { - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + static_cast(_numElements), + static_cast(p_count)); } else if (_commandType == COMMAND_ADD_FORWARD) { - hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCount, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + _numElements, + static_cast(p_count)); } else if (_commandType == COMMAND_COPY) { HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } else { @@ -239,7 +257,7 @@ size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool e return _mismatchCount; } - + //--- //Parse arguments specific to this test. @@ -300,7 +318,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< for (int i=0; iexpectedAdd(); - + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } @@ -330,7 +348,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< void sync_none(void) {}; -void sync_allDevices(int numDevices) +void sync_allDevices(int numDevices) { for (int d=0; d streamers) +void sync_queryAllUntilComplete(std::vector streamers) { for (int i=streamers.size()-1; i>=0; i--) { streamers[i]->queryUntilComplete(); @@ -347,7 +365,7 @@ void sync_queryAllUntilComplete(std::vector streamers) } -void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) { HIPCHECK(hipSetDevice(sideDeviceId)); @@ -389,7 +407,7 @@ int main(int argc, char *argv[]) initArray_h[i] = initValue; } HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); - + int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); @@ -414,7 +432,7 @@ int main(int argc, char *argv[]) // A sideband stream channel that is independent from above. - // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. std::vector sideStreams; for (int d=0; d Test 0x1000 simple null stream tests\n"); + printf ("==> Test 0x1000 simple null stream tests\n"); // try some null stream: hipStreamQuery(0); @@ -463,7 +481,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, s1)) HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } @@ -476,11 +494,11 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, hipStream_t(0))) HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } - + } diff --git a/tests/src/stress/hipStressKernel.cpp b/tests/src/stress/hipStressKernel.cpp index 7b5eec5a80..52d8fa1fe9 100644 --- a/tests/src/stress/hipStressKernel.cpp +++ b/tests/src/stress/hipStressKernel.cpp @@ -57,5 +57,8 @@ int main(){ } std::cout<