From c2482d1255ec6ea2f45aae872db001ce796bde1f Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 1 Nov 2017 15:09:59 +0000 Subject: [PATCH] This switches HIP from its currently convoluted macro + pfe based dispatch mechanism to a more natural one partially based on the existing module API. The basic idea is that HCC will always correctly emit __global__ functions: as empty-bodied stubs, on host, and as kernels, on device. It then becomes trivial to obtain the mangled name on host, at dispatch, from the function's address, and then to use the mangled name to retrieve the kernel. This should address all problems stemming from serialisation, dubious mismatches due to the manufactured functor, macro-isms et al. It also immediately enables support for generalised globals as a consequence of that being available in the module API. Finally, it will make debug much easier, since the actual names of the __global__ functions will automatically be used in traces etc. One detail is that due to how dispatch works now (hipLaunchKernel and hipLaunchKernelGGL are themselves variadic function templates which deduce the function type of the callee), in certain cases it may be necesssary to insert explicit casts to ensure that the variadic argument list selects a viable overload - this can be observed in some unit tests. Eventually we may be able to remove this limitation, but for now it does not appear terribly onerous. The code is not extremely HIPpie, nor is it fully optimised, but rather is intended as a starting point for the HIP team to make its own. --- include/hip/hcc_detail/code_object_bundle.hpp | 134 +++ include/hip/hcc_detail/grid_launch_GGL.hpp | 1059 ++--------------- include/hip/hcc_detail/hip_runtime.h | 4 +- include/hip/hcc_detail/host_defines.h | 3 +- include/hip/hcc_detail/program_state.hpp | 60 + src/code_object_bundle.cpp | 39 + src/grid_launch.cpp | 142 ++- src/hip_hcc_internal.h | 2 +- src/hip_memory.cpp | 20 +- src/hip_module.cpp | 197 +-- src/program_state.cpp | 498 ++++++++ tests/src/context/hipMemsetD8.cpp | 3 +- tests/src/deviceLib/hipTestDevice.cpp | 141 ++- tests/src/deviceLib/hipTestDeviceDouble.cpp | 124 +- tests/src/deviceLib/hip_test_ldg.cpp | 45 +- tests/src/experimental/xcompile/hipxxKer.cpp | 10 +- tests/src/kernel/hipLanguageExtensions.cpp | 4 +- tests/src/kernel/hipTestMemKernel.cpp | 45 +- tests/src/runtimeApi/event/hipEventRecord.cpp | 11 +- tests/src/runtimeApi/event/record_event.cpp | 31 +- tests/src/runtimeApi/memory/hipMemcpy.cpp | 77 +- .../runtimeApi/memory/hipMemcpy_simple.cpp | 11 +- .../multiThread/hipMultiThreadStreams1.cpp | 15 +- tests/src/runtimeApi/stream/hipNullStream.cpp | 46 +- .../src/runtimeApi/stream/hipStreamSync2.cpp | 39 +- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 46 +- tests/src/stress/hipStressKernel.cpp | 3 + 27 files changed, 1457 insertions(+), 1352 deletions(-) create mode 100644 include/hip/hcc_detail/code_object_bundle.hpp create mode 100644 include/hip/hcc_detail/program_state.hpp create mode 100644 src/code_object_bundle.cpp create mode 100644 src/program_state.cpp diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp new file mode 100644 index 0000000000..080132c561 --- /dev/null +++ b/include/hip/hcc_detail/code_object_bundle.hpp @@ -0,0 +1,134 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl +{ + hsa_isa_t triple_to_hsa_isa(const std::string& triple); + + struct Bundled_code { + union { + struct { + std::uint64_t offset; + std::uint64_t bundle_sz; + std::uint64_t triple_sz; + }; + std::uint8_t cbuf[ + sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; + }; + std::string triple; + std::vector blob; + }; + + class Bundled_code_header { + // DATA - STATICS + static constexpr const char magic_string_[] = + "__CLANG_OFFLOAD_BUNDLE__"; + static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1; + + // DATA + union { + struct { + std::uint8_t bundler_magic_string_[magic_string_sz_]; + std::uint64_t bundle_cnt_; + }; + std::uint8_t cbuf_[ + sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)]; + }; + std::vector bundles_; + + // FRIENDS - MANIPULATORS + template + friend + inline + bool read( + RandomAccessIterator f, + RandomAccessIterator l, + Bundled_code_header& x) + { + std::copy_n(f, sizeof(x.cbuf_), x.cbuf_); + + if (valid(x)) { + x.bundles_.resize(x.bundle_cnt_); + + auto it = f + sizeof(x.cbuf_); + for (auto&& y : x.bundles_) { + std::copy_n(it, sizeof(y.cbuf), y.cbuf); + it += sizeof(y.cbuf); + + y.triple.insert(y.triple.cend(), it, it + y.triple_sz); + + std::copy_n( + f + y.offset, y.bundle_sz, std::back_inserter(y.blob)); + + it += y.triple_sz; + } + + return true; + } + + return false; + } + friend + inline + bool read(const std::vector& blob, Bundled_code_header& x) + { + return read(blob.cbegin(), blob.cend(), x); + } + friend + inline + bool read(std::istream& is, Bundled_code_header& x) + { + return read(std::vector{ + std::istreambuf_iterator{is}, + std::istreambuf_iterator{}}, + x); + } + + // FRIENDS - ACCESSORS + friend + inline + bool valid(const Bundled_code_header& x) + { + return std::equal( + x.bundler_magic_string_, + x.bundler_magic_string_ + magic_string_sz_, + x.magic_string_); + } + friend + inline + const std::vector& bundles(const Bundled_code_header& x) + { + return x.bundles_; + } + public: + // CREATORS + Bundled_code_header() = default; + template + Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l); + explicit + Bundled_code_header(const std::vector& blob); + Bundled_code_header(const Bundled_code_header&) = default; + Bundled_code_header(Bundled_code_header&&) = default; + ~Bundled_code_header() = default; + + // MANIPULATORS + Bundled_code_header& operator=(const Bundled_code_header&) = default; + Bundled_code_header& operator=(Bundled_code_header&&) = default; + }; + + // CREATORS + template + Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{} + { + read(f, l, *this); + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 4c632f9d68..e3fa3331ac 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -24,984 +24,139 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 1 +#include "code_object_bundle.hpp" #include "concepts.hpp" #include "helpers.hpp" +#include "program_state.hpp" #include "hc.hpp" #include "hip/hip_hcc.h" #include "hip_runtime.h" +#include +#include #include #include +#include #include +#include +#include #include +#include #include +#include namespace hip_impl { - namespace + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + T round_up_to_next_multiple_nonnegative(T x, T y) { - struct New_grid_launch_tag {}; - struct Old_grid_launch_tag {}; - - template - class RAII_guard { - D dtor_; - public: - RAII_guard() = default; - - RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} - { - ctor(); - } - - RAII_guard(const RAII_guard&) = default; - RAII_guard(RAII_guard&&) = default; - - RAII_guard& operator=(const RAII_guard&) = default; - RAII_guard& operator=(RAII_guard&&) = default; - - ~RAII_guard() { dtor_(); } - }; - - template - RAII_guard make_RAII_guard(const C& ctor, D dtor) - { - return RAII_guard{ctor, std::move(dtor)}; - } - - template - using is_new_grid_launch_t = typename std::conditional< - is_callable{}, - New_grid_launch_tag, - Old_grid_launch_tag>::type; + T tmp = x + y - 1; + return tmp - tmp % y; } - // TODO: - dispatch rank should be derived from the domain dimensions passed - // in, and not always assumed to be 3; - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - const hc::accelerator_view& acc_v, - K k) + std::vector make_kernarg() { - const auto d = hc::extent<3>{ - num_blocks.z * dim_blocks.z, - num_blocks.y * dim_blocks.y, - num_blocks.x * dim_blocks.x}.tile_with_dynamic( - dim_blocks.z, - dim_blocks.y, - dim_blocks.x, - group_mem_bytes); - - try { - hc::parallel_for_each(acc_v, d, k); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } + return {}; } - // TODO: these are workarounds, they should be removed. - - hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); - void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); - void unlock_stream_hip_( - hipStream_t, void*, const char*, hc::accelerator_view*); - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, + std::vector make_kernarg(std::vector kernarg) + { + return kernarg; + } + + template + inline + std::vector make_kernarg(std::vector kernarg, T x) + { + kernarg.resize( + round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + + sizeof(T)); + + new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; + + return kernarg; + } + + template + inline + std::vector make_kernarg( + std::vector kernarg, T x, Ts... xs) + { + return make_kernarg( + make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); + } + + template + inline + std::vector make_kernarg(Ts... xs) + { + std::vector kernarg; + kernarg.reserve(sizeof(std::tuple)); + + return make_kernarg(std::move(kernarg), std::move(xs)...); + } + + void hipLaunchKernelGGLImpl( + std::uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, hipStream_t stream, - const char* kernel_name, - K k) - { - void* lck_stream = nullptr; - auto acc_v = lock_stream_hip_(stream, lck_stream); - auto stream_guard = make_RAII_guard( - std::bind( - print_prelaunch_trace_, - kernel_name, - num_blocks, - dim_blocks, - group_mem_bytes, - stream), - std::bind( - unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); + void** kernarg); +} // Namespace hip_impl. - try { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - acc_v, - std::move(k)); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } - } +template +inline +void hipLaunchKernelGGL( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + Args... args) +{ + auto kernarg = hip_impl::make_kernarg(std::move(args)...); + std::size_t kernarg_size = kernarg.size(); - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } + void* config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),//&kernarg, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, + HIP_LAUNCH_PARAM_END + }; - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } - - // TODO: these are temporary and purposefully noisy and disruptive. - #define make_kernel_name_hip(k, n)\ - HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ - HIP_kernel_functor_name_end ## _ ## n - - #define make_kernel_functor_hip_30(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26, p27)\ - struct make_kernel_name_hip(function_name, 28) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - std::decay_t _p27_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_, _p27_);\ - }\ - } - #define make_kernel_functor_hip_29(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26)\ - struct make_kernel_name_hip(function_name, 27) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_);\ - }\ - } - #define make_kernel_functor_hip_28(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25)\ - struct make_kernel_name_hip(function_name, 26) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ - }\ - } - #define make_kernel_functor_hip_27(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24)\ - struct make_kernel_name_hip(function_name, 25) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - }\ - } - #define make_kernel_functor_hip_26(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ - struct make_kernel_name_hip(function_name, 24) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - }\ - } - #define make_kernel_functor_hip_25(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ - struct make_kernel_name_hip(function_name, 23) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - __attribute__((used, flatten))\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - }\ - } - #define make_kernel_functor_hip_24(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ - struct make_kernel_name_hip(function_name, 22) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - }\ - } - #define make_kernel_functor_hip_23(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ - struct make_kernel_name_hip(function_name, 21) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - }\ - } - #define make_kernel_functor_hip_22(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ - struct make_kernel_name_hip(function_name, 20) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - }\ - } - #define make_kernel_functor_hip_21(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18)\ - struct make_kernel_name_hip(function_name, 19) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - }\ - } - #define make_kernel_functor_hip_20(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17)\ - struct make_kernel_name_hip(function_name, 18) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - }\ - } - #define make_kernel_functor_hip_19(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16)\ - struct make_kernel_name_hip(function_name, 17) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - }\ - } - #define make_kernel_functor_hip_18(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15)\ - struct make_kernel_name_hip(function_name, 16) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - }\ - } - #define make_kernel_functor_hip_17(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14)\ - struct make_kernel_name_hip(function_name, 15) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_);\ - }\ - } - #define make_kernel_functor_hip_16(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13)\ - struct make_kernel_name_hip(function_name, 14) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_);\ - }\ - } - #define make_kernel_functor_hip_15(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12)\ - struct make_kernel_name_hip(function_name, 13) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_);\ - }\ - } - #define make_kernel_functor_hip_14(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11)\ - struct make_kernel_name_hip(function_name, 12) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_);\ - }\ - } - #define make_kernel_functor_hip_13(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - struct make_kernel_name_hip(function_name, 11) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - }\ - } - #define make_kernel_functor_hip_12(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - struct make_kernel_name_hip(function_name, 10) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_);\ - }\ - } - #define make_kernel_functor_hip_11(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - struct make_kernel_name_hip(function_name, 9) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ - }\ - } - #define make_kernel_functor_hip_10(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - struct make_kernel_name_hip(function_name, 8) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - }\ - } - #define make_kernel_functor_hip_9(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - struct make_kernel_name_hip(function_name, 7) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - }\ - } - #define make_kernel_functor_hip_8(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ - struct make_kernel_name_hip(function_name, 6) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - }\ - } - #define make_kernel_functor_hip_7(\ - function_name, kernel_name, p0, p1, p2, p3, p4)\ - struct make_kernel_name_hip(function_name, 5) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - }\ - } - #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ - struct make_kernel_name_hip(function_name, 4) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - }\ - } - #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ - struct make_kernel_name_hip(function_name, 3) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_);\ - }\ - } - #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ - struct make_kernel_name_hip(function_name, 2) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_);\ - }\ - } - #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n - #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ - struct make_kernel_name_hip(function_name, 1) {\ - std::decay_t _p0_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_);\ - }\ - } - #define make_kernel_functor_hip_2(function_name, kernel_name)\ - struct make_kernel_name_hip(function_name, 0) {\ - void operator()(const hc::tiled_index<3>&) [[hc]]\ - {\ - return kernel_name(hipLaunchParm{});\ - }\ - } - #define make_kernel_functor_hip_1(...) - #define make_kernel_functor_hip_0(...) - #define make_kernel_functor_hip_(...)\ - overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - - - #define hipLaunchNamedKernelGGL(\ - function_name,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ - hip_kernel_functor_impl_{__VA_ARGS__};\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - hip_kernel_functor_impl_);\ - } while(0) - - #define hipLaunchKernelGGL(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchNamedKernelGGL(\ - unnamed,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ##__VA_ARGS__);\ - } while (0) - - #define hipLaunchKernel(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) + hip_impl::hipLaunchKernelGGLImpl( + reinterpret_cast(kernel), + numBlocks, + dimBlocks, + sharedMemBytes, + stream, + &config[0]); } + +template +inline +void hipLaunchKernel( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t groupMemBytes, + hipStream_t stream, + Args... args) +{ + hipLaunchKernelGGL( + kernel, + numBlocks, + dimBlocks, + groupMemBytes, + stream, + hipLaunchParm{}, + std::move(args)...); +} + #endif //GENERIC_GRID_LAUNCH diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 370ac2abbb..d3211ed3f5 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -53,7 +53,7 @@ THE SOFTWARE. // define HIP_ENABLE_PRINTF to enable printf #ifdef HIP_ENABLE_PRINTF #define HCC_ENABLE_ACCELERATOR_PRINTF 1 -#endif +#endif //--- // Remainder of this file only compiles with HCC @@ -481,7 +481,7 @@ do {\ type* var = \ (type*)__get_dynamicgroupbaseptr(); \ -#define HIP_DYNAMIC_SHARED_ATTRIBUTE +#define HIP_DYNAMIC_SHARED_ATTRIBUTE diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index b2e7ac2617..56cfa0cc0f 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -44,7 +44,8 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak)) +#define __global__ \ + __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp new file mode 100644 index 0000000000..03701725eb --- /dev/null +++ b/include/hip/hcc_detail/program_state.hpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +struct ihipModuleSymbol_t; +using hipFunction_t = ihipModuleSymbol_t*; + +namespace hip_impl +{ + struct Kernel_descriptor { + std::uint64_t kernel_object_; + std::uint32_t group_size_; + std::uint32_t private_size_; + std::string name_; + + operator hipFunction_t() const + { // TODO: this is awful and only meant for illustration. + return reinterpret_cast( + const_cast(this)); + } + }; + + const std::unordered_map< + std::uintptr_t, + std::vector>>& functions(); + const std::unordered_map& function_names(); + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, std::istream& file); +} // Namespace hip_impl. \ No newline at end of file diff --git a/src/code_object_bundle.cpp b/src/code_object_bundle.cpp new file mode 100644 index 0000000000..d7d2cd1e10 --- /dev/null +++ b/src/code_object_bundle.cpp @@ -0,0 +1,39 @@ +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include + +#include +#include +#include + +hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple) +{ + static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx"; + static constexpr std::size_t prefix_sz = sizeof(prefix) - 1; + + hsa_isa_t r = {}; + + auto idx = triple.find(prefix); + + if (idx != std::string::npos) { + idx += prefix_sz; + std::string tmp = "AMD:AMDGPU"; + while (idx != triple.size()) { + tmp.push_back(':'); + tmp.push_back(triple[idx++]); + } + + hsa_isa_from_name(tmp.c_str(), &r); + } + + return r; +} + +// DATA - STATICS +constexpr const char hip_impl::Bundled_code_header::magic_string_[]; + +// CREATORS +hip_impl::Bundled_code_header::Bundled_code_header( + const std::vector& x) + : Bundled_code_header{x.cbegin(), x.cend()} +{} \ No newline at end of file diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index fd5c2a1573..4a26f66c8c 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -21,76 +21,118 @@ THE SOFTWARE. */ #include "hip/hcc_detail/grid_launch_GGL.hpp" +#include "hip/hcc_detail/program_state.hpp" + +#include "hip/hip_runtime_api.h" // Internal header, do not percolate upwards. #include "hip_hcc_internal.h" #include "hc.hpp" #include "trace_helper.h" +#include +#include +#include +#include + #include -#include + +using namespace hc; +using namespace std; namespace hip_impl { - hc::accelerator_view lock_stream_hip_( - hipStream_t& stream, void*& locked_stream) - { // This allocated but does not take ownership of locked_stream. If it is - // not deleted elsewhere it will leak. - using L = decltype(stream->lockopen_preKernelCommand()); - - HIP_INIT(); - - stream = ihipSyncAndResolveStream(stream); - locked_stream = new L{stream->lockopen_preKernelCommand()}; - return (*static_cast(locked_stream))->_av; - } - - void print_prelaunch_trace_( - const char* kernel_name, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream) + namespace { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || - HIP_PROFILE_API || - (COMPILE_HIP_DB && (HIP_TRACE_API & (1<second; + } + + inline + string name(hsa_agent_t agent) + { + char n[64] = {}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return string{n}; + } + + inline + hsa_agent_t target_agent(hipStream_t stream) + { + if (stream) { + return *static_cast( + stream->locked_getAv()->get_hsa_agent()); + } + else if ( + ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + return ihipGetDevice( + ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; + } + else { + return *static_cast( + accelerator{}.get_default_view().get_hsa_agent()); } } } - void unlock_stream_hip_( + void hipLaunchKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, hipStream_t stream, - void* locked_stream, - const char* kernel_name, - hc::accelerator_view* acc_v) - { // Precondition: acc_v is the accelerator_view associated with stream - // which is guarded by locked_stream; - // locked_stream is deletable. - using L = decltype(stream->lockopen_preKernelCommand()); + void** kernarg) + { + const auto it0 = functions().find(function_address); - stream->lockclose_postKernelCommand(kernel_name, acc_v); + if (it0 == functions().cend()) { + throw runtime_error{ + "No device code available for function: " + + name(function_address) + }; + } - delete static_cast(locked_stream); - locked_stream = nullptr; + auto agent = target_agent(stream); + + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first.handle == agent.handle; + }); + + if (it1 == it0->second.cend()) { + throw runtime_error{ + "No code available for function: " + name(function_address) + + ", for agent: " + name(agent) + }; + } + + for (auto&& agent_kernel : it0->second) { + if (agent.handle == agent_kernel.first.handle) { + hipModuleLaunchKernel( + agent_kernel.second, + numBlocks.x, + numBlocks.y, + numBlocks.z, + dimBlocks.x, + dimBlocks.y, + dimBlocks.z, + sharedMemBytes, + stream, + nullptr, + kernarg); + } + } } } diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 197cd35bfa..503bebcd6a 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -233,7 +233,7 @@ static const DbName dbName [] = #if COMPILE_HIP_DB #define tprintf(trace_level, ...) {\ if (HIP_DB & (1<<(trace_level))) {\ - char msgStr[1000];\ + char msgStr[2000];\ snprintf(msgStr, 2000, __VA_ARGS__);\ fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \ }\ diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index a8324c5729..96fc25c27d 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -65,7 +65,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) if (shareWithAll) { hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); - tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); if (s != HSA_STATUS_SUCCESS) { ret = -1; } @@ -122,7 +122,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool if (HIP_INIT_ALLOC != -1) { // TODO , dont' call HIP API directly here: hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes); - } + } if (ptr != nullptr) { int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); @@ -251,7 +251,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hip_status = hipErrorMemoryAllocation; } - } + } return ihipLogStatus(hip_status); @@ -284,10 +284,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } - const unsigned supportedFlags = hipHostMallocPortable - | hipHostMallocMapped - | hipHostMallocWriteCombined - | hipHostMallocCoherent + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent | hipHostMallocNonCoherent; @@ -300,7 +300,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) hip_status = hipErrorInvalidValue; } else { auto device = ctx->getWriteableDevice(); - + unsigned amFlags = 0; if (flags & hipHostMallocCoherent) { amFlags = amHostCoherent; @@ -581,7 +581,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array, hsa_ext_image_data_info_t imageInfo; hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo); size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment; - + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment); if (size && (*ptr == NULL)) { @@ -1585,7 +1585,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ HIP_INIT_API ( handle, devPtr); hipError_t hipStatus = hipSuccess; // Get the size of allocated pointer - size_t psize; + size_t psize = 0u; hc::accelerator acc; if((handle == NULL) || (devPtr == NULL)) { hipStatus = hipErrorInvalidResourceHandle; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index e9e572af9b..a77ee48a53 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -119,15 +119,18 @@ namespace hipdrv { uint64_t PrintSymbolSizes(const void *emi, const char *name){ using namespace ELFIO; - const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi; + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; if(NULL == ehdr || EV_CURRENT != ehdr->e_version){} - const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff); + const ELFIO::Elf64_Shdr * shdr = + (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff); for(uint16_t i=0;ie_shnum;++i){ if(shdr[i].sh_type == SHT_SYMTAB){ - const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset); + const ELFIO::Elf64_Sym *syms = + (const ELFIO::Elf64_Sym*)((char*)emi + shdr[i].sh_offset); assert(syms); uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize; - const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); + const char* strtab = + (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); assert(strtab); for(uint64_t i=0;ie_shoff); + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; + const ELFIO::Elf64_Shdr *shdr = (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff); uint64_t max_offset = ehdr->e_shoff; uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; @@ -164,156 +167,8 @@ uint64_t ElfSize(const void *emi){ return total_size; } -namespace -{ - template - inline - ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) - { - using namespace std; - - const auto it = find_if( - reader.sections.begin(), reader.sections.end(), move(p)); - - return it != reader.sections.end() ? *it : nullptr; - } - - inline - std::vector copy_names_of_undefined_symbols( - const ELFIO::symbol_section_accessor& section) - { - using namespace ELFIO; - using namespace std; - - vector r; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (sect_idx == SHN_UNDEF && !name.empty()) { - r.push_back(std::move(name)); - } - } - - return r; - } - - inline - std::pair find_symbol_address( - const ELFIO::symbol_section_accessor& section, - const std::string& symbol_name) - { - using namespace ELFIO; - using namespace std; - - static constexpr pair r{0, 0}; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (name == symbol_name) return make_pair(value, size); - } - - return r; - } - - inline - void associate_code_object_symbols_with_host_allocation( - const ELFIO::elfio& reader, - const ELFIO::elfio& self_reader, - ELFIO::section* code_object_dynsym, - ELFIO::section* process_symtab, - hsa_agent_t agent, - hsa_executable_t executable) - { - using namespace ELFIO; - using namespace std; - - if (!code_object_dynsym || !process_symtab) return; - - const auto undefined_symbols = copy_names_of_undefined_symbols( - symbol_section_accessor{reader, code_object_dynsym}); - - for (auto&& x : undefined_symbols) { - const auto tmp = find_symbol_address( - symbol_section_accessor{self_reader, process_symtab}, x); - - assert(tmp.first); - - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - - static vector< - unique_ptr> globals; - static mutex mtx; - - lock_guard lck{mtx}; - globals.emplace_back(p, hsa_amd_memory_unlock); - } - } - - inline - void load_code_object_and_freeze_executable( - const char* file, hsa_agent_t agent, hsa_executable_t executable) - { // TODO: the following sequence is inefficient, should be refactored - // into a single load of the file and subsequent ELFIO - // processing. - using namespace std; - - static const auto cor_deleter = [](hsa_code_object_reader_t* p) { - hsa_code_object_reader_destroy(*p); - }; - - using RAII_code_reader = unique_ptr< - hsa_code_object_reader_t, decltype(cor_deleter)>; - - unique_ptr cobj{fopen(file, "r"), fclose}; - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get()); - - hsa_executable_load_agent_code_object( - executable, agent, *tmp, nullptr, nullptr); - - hsa_executable_freeze(executable, nullptr); - - static vector code_readers; - static mutex mtx; - - lock_guard lck{mtx}; - code_readers.push_back(move(tmp)); - } -} - hipError_t hipModuleLoad(hipModule_t *module, const char *fname) { - using namespace ELFIO; - HIP_INIT_API(module, fname); hipError_t ret = hipSuccess; *module = new ihipModule_t; @@ -336,36 +191,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname) nullptr, &(*module)->executable); - elfio reader; - if (!reader.load(fname)) { + std::ifstream file{fname}; + + if (!file.is_open()) { return ihipLogStatus(hipErrorFileNotFound); } - else { - // TODO: this may benefit from caching as well. - elfio self_reader; - self_reader.load("/proc/self/exe"); - - const auto symtab = - find_section_if(self_reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - const auto code_object_dynsym = - find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_DYNSYM; - }); - - associate_code_object_symbols_with_host_allocation( - reader, - self_reader, - code_object_dynsym, - symtab, - currentDevice->_hsaAgent, - (*module)->executable); - - load_code_object_and_freeze_executable( - fname, currentDevice->_hsaAgent, (*module)->executable); - } + (*module)->executable = hip_impl::load_executable( + (*module)->executable, currentDevice->_hsaAgent, file); + ret = (*module)->executable.handle ? hipSuccess : hipErrorUnknown; } return ihipLogStatus(ret); diff --git a/src/program_state.cpp b/src/program_state.cpp new file mode 100644 index 0000000000..be871a6e84 --- /dev/null +++ b/src/program_state.cpp @@ -0,0 +1,498 @@ +#include "../include/hip/hcc_detail/program_state.hpp" + +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include "hip_hcc_internal.h" +#include "trace_helper.h" + +#include "elfio/elfio.hpp" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ELFIO; +using namespace hip_impl; +using namespace std; + +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; + + template<> + struct hash { + size_t operator()(hsa_isa_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + +inline +constexpr +bool operator==(hsa_isa_t x, hsa_isa_t y) +{ + return x.handle == y.handle; +} + +namespace +{ + vector copy_names_of_undefined_symbols( + const symbol_section_accessor& section) + { + vector r; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (sect_idx == SHN_UNDEF && !name.empty()) { + r.push_back(std::move(name)); + } + } + + return r; + } + + pair find_symbol_address( + const symbol_section_accessor& section, + const string& symbol_name) + { + static constexpr pair r{0, 0}; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (name == symbol_name) return make_pair(value, size); + } + + return r; + } + + void associate_code_object_symbols_with_host_allocation( + const elfio& reader, + const elfio& self_reader, + section* code_object_dynsym, + section* process_symtab, + hsa_agent_t agent, + hsa_executable_t executable) + { + if (!code_object_dynsym || !process_symtab) return; + + const auto undefined_symbols = copy_names_of_undefined_symbols( + symbol_section_accessor{reader, code_object_dynsym}); + + for (auto&& x : undefined_symbols) { + const auto tmp = find_symbol_address( + symbol_section_accessor{self_reader, process_symtab}, x); + + assert(tmp.first); + + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); + + hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + + static vector< + unique_ptr> globals; + static mutex mtx; + + lock_guard lck{mtx}; + globals.emplace_back(p, hsa_amd_memory_unlock); + } + } + + template + inline + section* find_section_if(elfio& reader, P p) + { + const auto it = find_if( + reader.sections.begin(), reader.sections.end(), std::move(p)); + + return it != reader.sections.end() ? *it : nullptr; + } + + vector code_object_blob_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + static constexpr const char kernel_section[] = ".kernel"; + + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{"Failed to load ELF file for current process."}; + } + + auto kernels = find_section_if(reader, [](const section* x) { + return x->get_name() == kernel_section; + }); + + vector r; + if (kernels) { + r.insert( + r.end(), + kernels->get_data(), + kernels->get_data() + kernels->get_size()); + } + + return r; + } + + const unordered_map>>& code_object_blobs() + { + static unordered_map>> r; + static once_flag f; + + call_once(f, []() { + static vector> blobs{ + code_object_blob_for_process()}; + + dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) { + elfio tmp; + if (tmp.load(i->dlpi_name)) { + const auto it = find_section_if(tmp, [](const section* x) { + return x->get_name() == ".kernel"; + }); + + if (it) blobs.emplace_back( + it->get_data(), it->get_data() + it->get_size()); + } + return 0; + }, nullptr); + + for (auto&& blob : blobs) { + Bundled_code_header tmp{blob}; + if (valid(tmp)) { + for (auto&& bundle : bundles(tmp)) { + r[triple_to_hsa_isa(bundle.triple)] + .push_back(bundle.blob); + } + } + } + }); + + return r; + } + + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + cout << r.size() << endl; + return r; + } + + inline + hsa_agent_t agent(hsa_executable_symbol_t x) + { + hsa_agent_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); + + return r; + } + + inline + uint32_t group_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); + + return r; + } + + inline + uint64_t kernel_object(hsa_executable_symbol_t x) + { + uint64_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); + + return r; + } + + inline + string name(hsa_executable_symbol_t x) + { + uint32_t sz = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + string r(sz, '\0'); + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; + } + + inline + uint32_t private_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); + + return r; + } + + inline + hsa_symbol_kind_t type(hsa_executable_symbol_t x) + { + hsa_symbol_kind_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); + + return r; + } + + const unordered_map>& kernels() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto copy_kernels = []( + hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) { + if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s); + + return HSA_STATUS_SUCCESS; + }; + + for (auto&& agent_executables : executables()) { + for (auto&& executable : agent_executables.second) { + hsa_executable_iterate_agent_symbols( + executable, + agent_executables.first, + copy_kernels, + nullptr); + } + } + }); + + return r; + } + + void load_code_object_and_freeze_executable( + istream& file, hsa_agent_t agent, hsa_executable_t executable) + { // TODO: the following sequence is inefficient, should be refactored + // into a single load of the file and subsequent ELFIO + // processing. + static const auto cor_deleter = [](hsa_code_object_reader_t* p) { + hsa_code_object_reader_destroy(*p); + }; + + using RAII_code_reader = unique_ptr< + hsa_code_object_reader_t, decltype(cor_deleter)>; + + file.seekg(0); + + vector blob{ + istreambuf_iterator{file}, istreambuf_iterator{}}; + RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; + hsa_code_object_reader_create_from_memory( + blob.data(), blob.size(), tmp.get()); + + hsa_executable_load_agent_code_object( + executable, agent, *tmp, nullptr, nullptr); + + hsa_executable_freeze(executable, nullptr); + + static vector code_readers; + static mutex mtx; + + lock_guard lck{mtx}; + code_readers.push_back(move(tmp)); + } +} + +namespace hip_impl +{ + const unordered_map& function_names() + { + static constexpr const char self[] = "/proc/self/exe"; + + static unordered_map r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + symbol_section_accessor symbols{reader, symtab}; + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + symbols.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { + r.emplace(value, name); + } + } + }); + + return r; + } + + const unordered_map< + uintptr_t, vector>>& functions() + { + static unordered_map< + uintptr_t, vector>> r; + static once_flag f; + + call_once(f, []() { + for (auto&& function : function_names()) { + const auto it = kernels().find(function.second); + + if (it != kernels().cend()) { + for (auto&& kernel_symbol : it->second) { + r[function.first].emplace_back( + agent(kernel_symbol), + Kernel_descriptor{ + kernel_object(kernel_symbol), + group_size(kernel_symbol), + private_size(kernel_symbol), + it->first}); + } + } + } + }); + + return r; + } + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, istream& file) + { + elfio reader; + if (!reader.load(file)) { + return hsa_executable_t{}; + } + else { + // TODO: this may benefit from caching as well. + elfio self_reader; + self_reader.load("/proc/self/exe"); + + const auto symtab = + find_section_if(self_reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + const auto code_object_dynsym = + find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_DYNSYM; + }); + + associate_code_object_symbols_with_host_allocation( + reader, self_reader, code_object_dynsym, symtab, agent, executable); + + load_code_object_and_freeze_executable(file, agent, executable); + + return executable; + } + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/tests/src/context/hipMemsetD8.cpp b/tests/src/context/hipMemsetD8.cpp index 3730fcb70b..a356d05b76 100644 --- a/tests/src/context/hipMemsetD8.cpp +++ b/tests/src/context/hipMemsetD8.cpp @@ -46,7 +46,6 @@ int main(int argc, char *argv[]) A_h = new char[Nbytes]; HIPCHECK ( hipMalloc((void **) &A_d, Nbytes) ); - A_h = (char*)malloc(Nbytes); printf ("Size=%zu memsetval=%2x \n", Nbytes, memsetval); HIPCHECK ( hipMemsetD8(A_d, memsetval, Nbytes) ); @@ -61,7 +60,7 @@ int main(int argc, char *argv[]) } hipFree((void *) A_d); - free(A_h); + delete [] A_h; passed(); } diff --git a/tests/src/deviceLib/hipTestDevice.cpp b/tests/src/deviceLib/hipTestDevice.cpp index 570f3baaf0..fa85940839 100644 --- a/tests/src/deviceLib/hipTestDevice.cpp +++ b/tests/src/deviceLib/hipTestDevice.cpp @@ -139,7 +139,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -174,7 +181,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -205,7 +219,13 @@ for(int i=0;i<512;i++){ } } -free(A); +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -234,7 +254,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -263,7 +288,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -291,7 +321,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -321,7 +356,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -350,7 +390,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -387,7 +432,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -427,7 +481,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -457,7 +522,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -489,7 +559,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -525,7 +602,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -565,7 +651,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -595,7 +692,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -622,7 +724,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -631,7 +738,7 @@ return false; } int main(){ -if(run_sincosf() && run_sincospif() && run_fdividef() && +if(run_sincosf() && run_sincospif() && run_fdividef() && run_llrintf() && run_norm3df() && run_norm4df() && run_normf() && run_rnorm3df() && run_rnorm4df() && run_rnormf() && run_lroundf() && run_llroundf() && diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp index 5bdbbf1b8f..3b919d0cab 100644 --- a/tests/src/deviceLib/hipTestDeviceDouble.cpp +++ b/tests/src/deviceLib/hipTestDeviceDouble.cpp @@ -128,7 +128,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -163,7 +170,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -193,7 +207,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -221,7 +240,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -249,7 +273,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -278,7 +307,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -306,7 +340,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -343,7 +382,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -383,7 +431,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -416,7 +475,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -452,7 +518,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -492,7 +567,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -522,7 +608,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -549,7 +640,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index 171ff1afd0..5540c4917d 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -159,11 +159,16 @@ bool dataTypesRun(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -221,11 +226,16 @@ bool dataTypesRun2(){ HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -281,11 +291,16 @@ bool dataTypesRun4(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp index 79a272aaf2..d1bbed63cd 100644 --- a/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/tests/src/experimental/xcompile/hipxxKer.cpp @@ -36,17 +36,23 @@ __global__ void Kern(hipLaunchParm lp, float *A) int main() { - float *A, *Ad; + float A[len]; + float *Ad; + for(int i=0;i(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK (hipEventRecord(stop, NULL)); diff --git a/tests/src/runtimeApi/event/record_event.cpp b/tests/src/runtimeApi/event/record_event.cpp index bd8a3ada8e..a7b99749cb 100644 --- a/tests/src/runtimeApi/event/record_event.cpp +++ b/tests/src/runtimeApi/event/record_event.cpp @@ -52,7 +52,7 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", + printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", testMask, stream, waitStart, syncModeString(syncMode)); size_t sizeBytes = numElements * sizeof(int); @@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + stream, + static_cast(C_d), + C_h, + numElements, + count); HIPCHECK(hipEventRecord(stop, stream)); @@ -85,8 +94,8 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventSynchronize(start)); } - - hipError_t expectedStopError = hipSuccess; + + hipError_t expectedStopError = hipSuccess; // How to wait for the events to finish: switch (syncMode) { @@ -97,12 +106,12 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... break; case syncStopEvent: - HIPCHECK(hipEventSynchronize(stop)); + HIPCHECK(hipEventSynchronize(stop)); break; default: assert(0); }; - + float t; @@ -111,25 +120,25 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e)); } - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); - + // stop usually ready unless we skipped the synchronization (syncNone) HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError); - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); e = hipEventElapsedTime(&t, start, stop); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t>0.0f); printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); e = hipEventElapsedTime(&t, stop, start); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t<0.0f); printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index e8e803e44c..b3f25658fc 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -58,7 +58,7 @@ public: void offset(int offset) { _offset = offset; }; int offset() const { return _offset; }; - + private: T * _A_d; T* _B_d; @@ -72,7 +72,7 @@ private: template DeviceMemory::DeviceMemory(size_t numElements) - : _maxNumElements(numElements), + : _maxNumElements(numElements), _offset(0) { T ** np = nullptr; @@ -93,7 +93,7 @@ DeviceMemory::~DeviceMemory () HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); HIPCHECK (hipFree(_C_dd)); - + _C_dd = NULL; }; @@ -125,7 +125,7 @@ public: T * A_hh; T* B_hh; - bool _usePinnedHost; + bool _usePinnedHost; private: size_t _maxNumElements; @@ -165,11 +165,11 @@ HostMemory::HostMemory(size_t numElements, bool usePinnedHost) template void -HostMemory::reset(size_t numElements, bool full) +HostMemory::reset(size_t numElements, bool full) { // Initialize the host data: for (size_t i=0; i void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", - __func__, + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", + __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, @@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(dmem->A_d()), + static_cast(dmem->B_d()), + dmem->C_d(), + numElements); if (useDeviceToDevice) { // Do an extra device-to-device copy here to mix things up: @@ -273,8 +282,8 @@ void memcpytest2_for_type(size_t numElements) { printSep(); - DeviceMemory memD(numElements); - HostMemory memU(numElements, 0/*usePinnedHost*/); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); HostMemory memP(numElements, 1/*usePinnedHost*/); for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { @@ -307,11 +316,11 @@ void memcpytest2_sizes(size_t maxElem=0) maxElem = free/sizeof(T)/8; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); for (size_t elem=1; elem<=maxElem; elem*=2) { @@ -336,11 +345,11 @@ void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) HIPCHECK(hipMemGetInfo(&free, &total)); - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); size_t elem = maxElem / 2; @@ -380,16 +389,16 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - DeviceMemory memD(N); - HostMemory mem1(N, usePinnedHost); - HostMemory mem2(N, usePinnedHost); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); @@ -427,21 +436,21 @@ int main(int argc, char *argv[]) // Some tests around the 64KB boundary which have historically shown issues: printf ("\n\n=== tests&0x2 (64KB boundary)\n"); size_t maxElem = 32*1024*1024; - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); - HostMemory memP(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); // Just over 64MB: - memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); } @@ -464,7 +473,7 @@ int main(int argc, char *argv[]) // Simplest cases: serialize the threads, and also used pinned memory: // This verifies that the sub-calls to memcpytest2 are correct. - multiThread_1(true, true); + multiThread_1(true, true); // Serialize, but use unpinned memory to stress the unpinned memory xfer path. multiThread_1(true, false); diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 316f50c01b..9a09e7e95c 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -63,7 +63,16 @@ void simpleTest1() HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 4f73b67ad7..9d274543ab 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -41,8 +41,8 @@ void printSep() // Designed to stress a small number of simple smoke tests template< - typename T=float, - class P=HipTest::Unpinned, + typename T=float, + class P=HipTest::Unpinned, class C=HipTest::Memcpy > void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) @@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) // This is the null stream? //hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); - hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel( + HipTest::vectorADDReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp index b610315608..04a232f3bb 100644 --- a/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -119,7 +119,7 @@ void Streamer::reset() { HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); H2D(); - + } @@ -128,7 +128,17 @@ void Streamer::enqueAsync() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + static_cast(_B_d), + _C_d, + _numElements, + p_repeat); } @@ -225,7 +235,17 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); if (p_db) { @@ -238,7 +258,7 @@ int main(int argc, char *argv[]) nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -257,13 +277,23 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -289,10 +319,10 @@ int main(int argc, char *argv[]) // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); - + HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); } diff --git a/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/tests/src/runtimeApi/stream/hipStreamSync2.cpp index c6a58ce7d4..962737774d 100644 --- a/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -59,23 +59,23 @@ const char *syncModeString(int syncMode) { void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - // This test sends a long-running kernel to the null stream, then tests to see if the + // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // - // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it - // may complete before the check. To prevent this, the addCountReverse has a count parameter - // which causes it to loop repeatedly, and the results are checked in reverse order. + // may complete before the check. To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. - + if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -97,8 +97,17 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); - HIPCHECK(hipEventRecord(stop, 0/*default*/)); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0 /*stream*/, + static_cast(C_d), + C_h, + numElements, + count); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -108,18 +117,18 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s break; case syncOtherStream: // Does this synchronize with the null stream? - HIPCHECK(hipStreamSynchronize(otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - - // this may wait for NULL stream depending hipStreamNonBlocking flag above - HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); - HIPCHECK(hipStreamSynchronize(otherStream)); + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: - HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); @@ -197,7 +206,7 @@ void runTests(int64_t numElements) int main(int argc, char *argv[]) { // Can' destroy the default stream:// TODO - move to another test - HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index f5b1b79550..a7a930b4f6 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -88,7 +88,7 @@ private: template Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : - _preA_d(NULL), + _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), @@ -163,9 +163,27 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_commandType == COMMAND_ADD_REVERSE) { - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + static_cast(_numElements), + static_cast(p_count)); } else if (_commandType == COMMAND_ADD_FORWARD) { - hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCount, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + _numElements, + static_cast(p_count)); } else if (_commandType == COMMAND_COPY) { HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } else { @@ -239,7 +257,7 @@ size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool e return _mismatchCount; } - + //--- //Parse arguments specific to this test. @@ -300,7 +318,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< for (int i=0; iexpectedAdd(); - + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } @@ -330,7 +348,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< void sync_none(void) {}; -void sync_allDevices(int numDevices) +void sync_allDevices(int numDevices) { for (int d=0; d streamers) +void sync_queryAllUntilComplete(std::vector streamers) { for (int i=streamers.size()-1; i>=0; i--) { streamers[i]->queryUntilComplete(); @@ -347,7 +365,7 @@ void sync_queryAllUntilComplete(std::vector streamers) } -void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) { HIPCHECK(hipSetDevice(sideDeviceId)); @@ -389,7 +407,7 @@ int main(int argc, char *argv[]) initArray_h[i] = initValue; } HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); - + int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); @@ -414,7 +432,7 @@ int main(int argc, char *argv[]) // A sideband stream channel that is independent from above. - // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. std::vector sideStreams; for (int d=0; d Test 0x1000 simple null stream tests\n"); + printf ("==> Test 0x1000 simple null stream tests\n"); // try some null stream: hipStreamQuery(0); @@ -463,7 +481,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, s1)) HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } @@ -476,11 +494,11 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, hipStream_t(0))) HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } - + } diff --git a/tests/src/stress/hipStressKernel.cpp b/tests/src/stress/hipStressKernel.cpp index 7b5eec5a80..52d8fa1fe9 100644 --- a/tests/src/stress/hipStressKernel.cpp +++ b/tests/src/stress/hipStressKernel.cpp @@ -57,5 +57,8 @@ int main(){ } std::cout<