From c2482d1255ec6ea2f45aae872db001ce796bde1f Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 1 Nov 2017 15:09:59 +0000 Subject: [PATCH 01/35] This switches HIP from its currently convoluted macro + pfe based dispatch mechanism to a more natural one partially based on the existing module API. The basic idea is that HCC will always correctly emit __global__ functions: as empty-bodied stubs, on host, and as kernels, on device. It then becomes trivial to obtain the mangled name on host, at dispatch, from the function's address, and then to use the mangled name to retrieve the kernel. This should address all problems stemming from serialisation, dubious mismatches due to the manufactured functor, macro-isms et al. It also immediately enables support for generalised globals as a consequence of that being available in the module API. Finally, it will make debugging much easier, since the actual names of the __global__ functions will automatically be used in traces etc. One detail is that due to how dispatch works now (hipLaunchKernel and hipLaunchKernelGGL are themselves variadic function templates which deduce the function type of the callee), in certain cases it may be necessary to insert explicit casts to ensure that the variadic argument list selects a viable overload - this can be observed in some unit tests. Eventually we may be able to remove this limitation, but for now it does not appear terribly onerous. The code is not extremely HIPpie, nor is it fully optimised, but rather is intended as a starting point for the HIP team to make its own. 
--- include/hip/hcc_detail/code_object_bundle.hpp | 134 +++ include/hip/hcc_detail/grid_launch_GGL.hpp | 1059 ++--------------- include/hip/hcc_detail/hip_runtime.h | 4 +- include/hip/hcc_detail/host_defines.h | 3 +- include/hip/hcc_detail/program_state.hpp | 60 + src/code_object_bundle.cpp | 39 + src/grid_launch.cpp | 142 ++- src/hip_hcc_internal.h | 2 +- src/hip_memory.cpp | 20 +- src/hip_module.cpp | 197 +-- src/program_state.cpp | 498 ++++++++ tests/src/context/hipMemsetD8.cpp | 3 +- tests/src/deviceLib/hipTestDevice.cpp | 141 ++- tests/src/deviceLib/hipTestDeviceDouble.cpp | 124 +- tests/src/deviceLib/hip_test_ldg.cpp | 45 +- tests/src/experimental/xcompile/hipxxKer.cpp | 10 +- tests/src/kernel/hipLanguageExtensions.cpp | 4 +- tests/src/kernel/hipTestMemKernel.cpp | 45 +- tests/src/runtimeApi/event/hipEventRecord.cpp | 11 +- tests/src/runtimeApi/event/record_event.cpp | 31 +- tests/src/runtimeApi/memory/hipMemcpy.cpp | 77 +- .../runtimeApi/memory/hipMemcpy_simple.cpp | 11 +- .../multiThread/hipMultiThreadStreams1.cpp | 15 +- tests/src/runtimeApi/stream/hipNullStream.cpp | 46 +- .../src/runtimeApi/stream/hipStreamSync2.cpp | 39 +- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 46 +- tests/src/stress/hipStressKernel.cpp | 3 + 27 files changed, 1457 insertions(+), 1352 deletions(-) create mode 100644 include/hip/hcc_detail/code_object_bundle.hpp create mode 100644 include/hip/hcc_detail/program_state.hpp create mode 100644 src/code_object_bundle.cpp create mode 100644 src/program_state.cpp diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp new file mode 100644 index 0000000000..080132c561 --- /dev/null +++ b/include/hip/hcc_detail/code_object_bundle.hpp @@ -0,0 +1,134 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl +{ + hsa_isa_t triple_to_hsa_isa(const std::string& triple); + + struct Bundled_code { + union { + struct { + 
std::uint64_t offset; + std::uint64_t bundle_sz; + std::uint64_t triple_sz; + }; + std::uint8_t cbuf[ + sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; + }; + std::string triple; + std::vector blob; + }; + + class Bundled_code_header { + // DATA - STATICS + static constexpr const char magic_string_[] = + "__CLANG_OFFLOAD_BUNDLE__"; + static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1; + + // DATA + union { + struct { + std::uint8_t bundler_magic_string_[magic_string_sz_]; + std::uint64_t bundle_cnt_; + }; + std::uint8_t cbuf_[ + sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)]; + }; + std::vector bundles_; + + // FRIENDS - MANIPULATORS + template + friend + inline + bool read( + RandomAccessIterator f, + RandomAccessIterator l, + Bundled_code_header& x) + { + std::copy_n(f, sizeof(x.cbuf_), x.cbuf_); + + if (valid(x)) { + x.bundles_.resize(x.bundle_cnt_); + + auto it = f + sizeof(x.cbuf_); + for (auto&& y : x.bundles_) { + std::copy_n(it, sizeof(y.cbuf), y.cbuf); + it += sizeof(y.cbuf); + + y.triple.insert(y.triple.cend(), it, it + y.triple_sz); + + std::copy_n( + f + y.offset, y.bundle_sz, std::back_inserter(y.blob)); + + it += y.triple_sz; + } + + return true; + } + + return false; + } + friend + inline + bool read(const std::vector& blob, Bundled_code_header& x) + { + return read(blob.cbegin(), blob.cend(), x); + } + friend + inline + bool read(std::istream& is, Bundled_code_header& x) + { + return read(std::vector{ + std::istreambuf_iterator{is}, + std::istreambuf_iterator{}}, + x); + } + + // FRIENDS - ACCESSORS + friend + inline + bool valid(const Bundled_code_header& x) + { + return std::equal( + x.bundler_magic_string_, + x.bundler_magic_string_ + magic_string_sz_, + x.magic_string_); + } + friend + inline + const std::vector& bundles(const Bundled_code_header& x) + { + return x.bundles_; + } + public: + // CREATORS + Bundled_code_header() = default; + template + Bundled_code_header(RandomAccessIterator f, RandomAccessIterator 
l); + explicit + Bundled_code_header(const std::vector& blob); + Bundled_code_header(const Bundled_code_header&) = default; + Bundled_code_header(Bundled_code_header&&) = default; + ~Bundled_code_header() = default; + + // MANIPULATORS + Bundled_code_header& operator=(const Bundled_code_header&) = default; + Bundled_code_header& operator=(Bundled_code_header&&) = default; + }; + + // CREATORS + template + Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{} + { + read(f, l, *this); + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 4c632f9d68..e3fa3331ac 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -24,984 +24,139 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 1 +#include "code_object_bundle.hpp" #include "concepts.hpp" #include "helpers.hpp" +#include "program_state.hpp" #include "hc.hpp" #include "hip/hip_hcc.h" #include "hip_runtime.h" +#include +#include #include #include +#include #include +#include +#include #include +#include #include +#include namespace hip_impl { - namespace + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + T round_up_to_next_multiple_nonnegative(T x, T y) { - struct New_grid_launch_tag {}; - struct Old_grid_launch_tag {}; - - template - class RAII_guard { - D dtor_; - public: - RAII_guard() = default; - - RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} - { - ctor(); - } - - RAII_guard(const RAII_guard&) = default; - RAII_guard(RAII_guard&&) = default; - - RAII_guard& operator=(const RAII_guard&) = default; - RAII_guard& operator=(RAII_guard&&) = default; - - ~RAII_guard() { dtor_(); } - }; - - template - RAII_guard make_RAII_guard(const C& ctor, D dtor) - { - return RAII_guard{ctor, std::move(dtor)}; - } - - template - using is_new_grid_launch_t = typename std::conditional< - is_callable{}, - 
New_grid_launch_tag, - Old_grid_launch_tag>::type; + T tmp = x + y - 1; + return tmp - tmp % y; } - // TODO: - dispatch rank should be derived from the domain dimensions passed - // in, and not always assumed to be 3; - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - const hc::accelerator_view& acc_v, - K k) + std::vector make_kernarg() { - const auto d = hc::extent<3>{ - num_blocks.z * dim_blocks.z, - num_blocks.y * dim_blocks.y, - num_blocks.x * dim_blocks.x}.tile_with_dynamic( - dim_blocks.z, - dim_blocks.y, - dim_blocks.x, - group_mem_bytes); - - try { - hc::parallel_for_each(acc_v, d, k); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } + return {}; } - // TODO: these are workarounds, they should be removed. - - hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); - void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); - void unlock_stream_hip_( - hipStream_t, void*, const char*, hc::accelerator_view*); - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, + std::vector make_kernarg(std::vector kernarg) + { + return kernarg; + } + + template + inline + std::vector make_kernarg(std::vector kernarg, T x) + { + kernarg.resize( + round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + + sizeof(T)); + + new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; + + return kernarg; + } + + template + inline + std::vector make_kernarg( + std::vector kernarg, T x, Ts... xs) + { + return make_kernarg( + make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); + } + + template + inline + std::vector make_kernarg(Ts... 
xs) + { + std::vector kernarg; + kernarg.reserve(sizeof(std::tuple)); + + return make_kernarg(std::move(kernarg), std::move(xs)...); + } + + void hipLaunchKernelGGLImpl( + std::uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, hipStream_t stream, - const char* kernel_name, - K k) - { - void* lck_stream = nullptr; - auto acc_v = lock_stream_hip_(stream, lck_stream); - auto stream_guard = make_RAII_guard( - std::bind( - print_prelaunch_trace_, - kernel_name, - num_blocks, - dim_blocks, - group_mem_bytes, - stream), - std::bind( - unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); + void** kernarg); +} // Namespace hip_impl. - try { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - acc_v, - std::move(k)); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } - } +template +inline +void hipLaunchKernelGGL( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + Args... 
args) +{ + auto kernarg = hip_impl::make_kernarg(std::move(args)...); + std::size_t kernarg_size = kernarg.size(); - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } + void* config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),//&kernarg, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, + HIP_LAUNCH_PARAM_END + }; - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } - - // TODO: these are temporary and purposefully noisy and disruptive. 
- #define make_kernel_name_hip(k, n)\ - HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ - HIP_kernel_functor_name_end ## _ ## n - - #define make_kernel_functor_hip_30(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26, p27)\ - struct make_kernel_name_hip(function_name, 28) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - std::decay_t _p27_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_, _p27_);\ - }\ - } - #define make_kernel_functor_hip_29(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26)\ - struct make_kernel_name_hip(function_name, 27) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - 
std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_);\ - }\ - } - #define make_kernel_functor_hip_28(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25)\ - struct make_kernel_name_hip(function_name, 26) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ - }\ - } - #define make_kernel_functor_hip_27(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24)\ - struct make_kernel_name_hip(function_name, 25) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - 
std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - }\ - } - #define make_kernel_functor_hip_26(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ - struct make_kernel_name_hip(function_name, 24) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - }\ - } - #define make_kernel_functor_hip_25(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ - struct make_kernel_name_hip(function_name, 23) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t 
_p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - __attribute__((used, flatten))\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - }\ - } - #define make_kernel_functor_hip_24(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ - struct make_kernel_name_hip(function_name, 22) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - }\ - } - #define make_kernel_functor_hip_23(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ - struct make_kernel_name_hip(function_name, 21) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - 
std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - }\ - } - #define make_kernel_functor_hip_22(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ - struct make_kernel_name_hip(function_name, 20) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - }\ - } - #define make_kernel_functor_hip_21(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18)\ - struct make_kernel_name_hip(function_name, 19) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - 
std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - }\ - } - #define make_kernel_functor_hip_20(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17)\ - struct make_kernel_name_hip(function_name, 18) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - }\ - } - #define make_kernel_functor_hip_19(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16)\ - struct make_kernel_name_hip(function_name, 17) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - }\ - } - #define 
make_kernel_functor_hip_18(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15)\ - struct make_kernel_name_hip(function_name, 16) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - }\ - } - #define make_kernel_functor_hip_17(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14)\ - struct make_kernel_name_hip(function_name, 15) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_);\ - }\ - } - #define make_kernel_functor_hip_16(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13)\ - struct make_kernel_name_hip(function_name, 14) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - void operator()(const hc::tiled_index<3>&) 
const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_);\ - }\ - } - #define make_kernel_functor_hip_15(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12)\ - struct make_kernel_name_hip(function_name, 13) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_);\ - }\ - } - #define make_kernel_functor_hip_14(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11)\ - struct make_kernel_name_hip(function_name, 12) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_);\ - }\ - } - #define make_kernel_functor_hip_13(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - struct make_kernel_name_hip(function_name, 11) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - }\ - } - #define 
make_kernel_functor_hip_12(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - struct make_kernel_name_hip(function_name, 10) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_);\ - }\ - } - #define make_kernel_functor_hip_11(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - struct make_kernel_name_hip(function_name, 9) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ - }\ - } - #define make_kernel_functor_hip_10(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - struct make_kernel_name_hip(function_name, 8) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - }\ - } - #define make_kernel_functor_hip_9(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - struct make_kernel_name_hip(function_name, 7) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - }\ - } - #define make_kernel_functor_hip_8(\ - function_name, kernel_name, 
p0, p1, p2, p3, p4, p5)\ - struct make_kernel_name_hip(function_name, 6) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - }\ - } - #define make_kernel_functor_hip_7(\ - function_name, kernel_name, p0, p1, p2, p3, p4)\ - struct make_kernel_name_hip(function_name, 5) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - }\ - } - #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ - struct make_kernel_name_hip(function_name, 4) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - }\ - } - #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ - struct make_kernel_name_hip(function_name, 3) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_);\ - }\ - } - #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ - struct make_kernel_name_hip(function_name, 2) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_);\ - }\ - } - #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n - #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ - struct make_kernel_name_hip(function_name, 1) {\ - std::decay_t _p0_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_);\ - }\ - } - #define make_kernel_functor_hip_2(function_name, kernel_name)\ - struct 
make_kernel_name_hip(function_name, 0) {\ - void operator()(const hc::tiled_index<3>&) [[hc]]\ - {\ - return kernel_name(hipLaunchParm{});\ - }\ - } - #define make_kernel_functor_hip_1(...) - #define make_kernel_functor_hip_0(...) - #define make_kernel_functor_hip_(...)\ - overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - - - #define hipLaunchNamedKernelGGL(\ - function_name,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ - hip_kernel_functor_impl_{__VA_ARGS__};\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - hip_kernel_functor_impl_);\ - } while(0) - - #define hipLaunchKernelGGL(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchNamedKernelGGL(\ - unnamed,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ##__VA_ARGS__);\ - } while (0) - - #define hipLaunchKernel(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) + hip_impl::hipLaunchKernelGGLImpl( + reinterpret_cast(kernel), + numBlocks, + dimBlocks, + sharedMemBytes, + stream, + &config[0]); } + +template +inline +void hipLaunchKernel( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t groupMemBytes, + hipStream_t stream, + Args... args) +{ + hipLaunchKernelGGL( + kernel, + numBlocks, + dimBlocks, + groupMemBytes, + stream, + hipLaunchParm{}, + std::move(args)...); +} + #endif //GENERIC_GRID_LAUNCH diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 370ac2abbb..d3211ed3f5 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -53,7 +53,7 @@ THE SOFTWARE. 
// define HIP_ENABLE_PRINTF to enable printf #ifdef HIP_ENABLE_PRINTF #define HCC_ENABLE_ACCELERATOR_PRINTF 1 -#endif +#endif //--- // Remainder of this file only compiles with HCC @@ -481,7 +481,7 @@ do {\ type* var = \ (type*)__get_dynamicgroupbaseptr(); \ -#define HIP_DYNAMIC_SHARED_ATTRIBUTE +#define HIP_DYNAMIC_SHARED_ATTRIBUTE diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index b2e7ac2617..56cfa0cc0f 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -44,7 +44,8 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak)) +#define __global__ \ + __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp new file mode 100644 index 0000000000..03701725eb --- /dev/null +++ b/include/hip/hcc_detail/program_state.hpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +struct ihipModuleSymbol_t; +using hipFunction_t = ihipModuleSymbol_t*; + +namespace hip_impl +{ + struct Kernel_descriptor { + std::uint64_t kernel_object_; + std::uint32_t group_size_; + std::uint32_t private_size_; + std::string name_; + + operator hipFunction_t() const + { // TODO: this is awful and only meant for illustration. + return reinterpret_cast( + const_cast(this)); + } + }; + + const std::unordered_map< + std::uintptr_t, + std::vector>>& functions(); + const std::unordered_map& function_names(); + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, std::istream& file); +} // Namespace hip_impl. 
\ No newline at end of file diff --git a/src/code_object_bundle.cpp b/src/code_object_bundle.cpp new file mode 100644 index 0000000000..d7d2cd1e10 --- /dev/null +++ b/src/code_object_bundle.cpp @@ -0,0 +1,39 @@ +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include + +#include +#include +#include + +hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple) +{ + static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx"; + static constexpr std::size_t prefix_sz = sizeof(prefix) - 1; + + hsa_isa_t r = {}; + + auto idx = triple.find(prefix); + + if (idx != std::string::npos) { + idx += prefix_sz; + std::string tmp = "AMD:AMDGPU"; + while (idx != triple.size()) { + tmp.push_back(':'); + tmp.push_back(triple[idx++]); + } + + hsa_isa_from_name(tmp.c_str(), &r); + } + + return r; +} + +// DATA - STATICS +constexpr const char hip_impl::Bundled_code_header::magic_string_[]; + +// CREATORS +hip_impl::Bundled_code_header::Bundled_code_header( + const std::vector& x) + : Bundled_code_header{x.cbegin(), x.cend()} +{} \ No newline at end of file diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index fd5c2a1573..4a26f66c8c 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -21,76 +21,118 @@ THE SOFTWARE. */ #include "hip/hcc_detail/grid_launch_GGL.hpp" +#include "hip/hcc_detail/program_state.hpp" + +#include "hip/hip_runtime_api.h" // Internal header, do not percolate upwards. #include "hip_hcc_internal.h" #include "hc.hpp" #include "trace_helper.h" +#include +#include +#include +#include + #include -#include + +using namespace hc; +using namespace std; namespace hip_impl { - hc::accelerator_view lock_stream_hip_( - hipStream_t& stream, void*& locked_stream) - { // This allocated but does not take ownership of locked_stream. If it is - // not deleted elsewhere it will leak. 
- using L = decltype(stream->lockopen_preKernelCommand()); - - HIP_INIT(); - - stream = ihipSyncAndResolveStream(stream); - locked_stream = new L{stream->lockopen_preKernelCommand()}; - return (*static_cast(locked_stream))->_av; - } - - void print_prelaunch_trace_( - const char* kernel_name, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream) + namespace { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || - HIP_PROFILE_API || - (COMPILE_HIP_DB && (HIP_TRACE_API & (1<second; + } + + inline + string name(hsa_agent_t agent) + { + char n[64] = {}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return string{n}; + } + + inline + hsa_agent_t target_agent(hipStream_t stream) + { + if (stream) { + return *static_cast( + stream->locked_getAv()->get_hsa_agent()); + } + else if ( + ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + return ihipGetDevice( + ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; + } + else { + return *static_cast( + accelerator{}.get_default_view().get_hsa_agent()); } } } - void unlock_stream_hip_( + void hipLaunchKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, hipStream_t stream, - void* locked_stream, - const char* kernel_name, - hc::accelerator_view* acc_v) - { // Precondition: acc_v is the accelerator_view associated with stream - // which is guarded by locked_stream; - // locked_stream is deletable. 
- using L = decltype(stream->lockopen_preKernelCommand()); + void** kernarg) + { + const auto it0 = functions().find(function_address); - stream->lockclose_postKernelCommand(kernel_name, acc_v); + if (it0 == functions().cend()) { + throw runtime_error{ + "No device code available for function: " + + name(function_address) + }; + } - delete static_cast(locked_stream); - locked_stream = nullptr; + auto agent = target_agent(stream); + + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first.handle == agent.handle; + }); + + if (it1 == it0->second.cend()) { + throw runtime_error{ + "No code available for function: " + name(function_address) + + ", for agent: " + name(agent) + }; + } + + for (auto&& agent_kernel : it0->second) { + if (agent.handle == agent_kernel.first.handle) { + hipModuleLaunchKernel( + agent_kernel.second, + numBlocks.x, + numBlocks.y, + numBlocks.z, + dimBlocks.x, + dimBlocks.y, + dimBlocks.z, + sharedMemBytes, + stream, + nullptr, + kernarg); + } + } } } diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 197cd35bfa..503bebcd6a 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -233,7 +233,7 @@ static const DbName dbName [] = #if COMPILE_HIP_DB #define tprintf(trace_level, ...) 
{\ if (HIP_DB & (1<<(trace_level))) {\ - char msgStr[1000];\ + char msgStr[2000];\ snprintf(msgStr, 2000, __VA_ARGS__);\ fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \ }\ diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index a8324c5729..96fc25c27d 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -65,7 +65,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) if (shareWithAll) { hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); - tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); if (s != HSA_STATUS_SUCCESS) { ret = -1; } @@ -122,7 +122,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool if (HIP_INIT_ALLOC != -1) { // TODO , dont' call HIP API directly here: hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes); - } + } if (ptr != nullptr) { int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); @@ -251,7 +251,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hip_status = hipErrorMemoryAllocation; } - } + } return ihipLogStatus(hip_status); @@ -284,10 +284,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } - const unsigned supportedFlags = hipHostMallocPortable - | hipHostMallocMapped - | hipHostMallocWriteCombined - | hipHostMallocCoherent + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent | hipHostMallocNonCoherent; @@ -300,7 +300,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) hip_status = hipErrorInvalidValue; } else { auto device = ctx->getWriteableDevice(); - + unsigned amFlags = 0; if (flags & hipHostMallocCoherent) { amFlags = amHostCoherent; @@ -581,7 +581,7 @@ hipError_t 
hipMalloc3DArray(hipArray_t *array, hsa_ext_image_data_info_t imageInfo; hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo); size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment; - + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment); if (size && (*ptr == NULL)) { @@ -1585,7 +1585,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ HIP_INIT_API ( handle, devPtr); hipError_t hipStatus = hipSuccess; // Get the size of allocated pointer - size_t psize; + size_t psize = 0u; hc::accelerator acc; if((handle == NULL) || (devPtr == NULL)) { hipStatus = hipErrorInvalidResourceHandle; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index e9e572af9b..a77ee48a53 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -119,15 +119,18 @@ namespace hipdrv { uint64_t PrintSymbolSizes(const void *emi, const char *name){ using namespace ELFIO; - const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi; + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; if(NULL == ehdr || EV_CURRENT != ehdr->e_version){} - const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff); + const ELFIO::Elf64_Shdr * shdr = + (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff); for(uint16_t i=0;ie_shnum;++i){ if(shdr[i].sh_type == SHT_SYMTAB){ - const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset); + const ELFIO::Elf64_Sym *syms = + (const ELFIO::Elf64_Sym*)((char*)emi + shdr[i].sh_offset); assert(syms); uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize; - const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); + const char* strtab = + (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); assert(strtab); for(uint64_t i=0;ie_shoff); + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; + const ELFIO::Elf64_Shdr *shdr = (const ELFIO::Elf64_Shdr*)((char*)emi 
+ ehdr->e_shoff); uint64_t max_offset = ehdr->e_shoff; uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; @@ -164,156 +167,8 @@ uint64_t ElfSize(const void *emi){ return total_size; } -namespace -{ - template - inline - ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) - { - using namespace std; - - const auto it = find_if( - reader.sections.begin(), reader.sections.end(), move(p)); - - return it != reader.sections.end() ? *it : nullptr; - } - - inline - std::vector copy_names_of_undefined_symbols( - const ELFIO::symbol_section_accessor& section) - { - using namespace ELFIO; - using namespace std; - - vector r; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (sect_idx == SHN_UNDEF && !name.empty()) { - r.push_back(std::move(name)); - } - } - - return r; - } - - inline - std::pair find_symbol_address( - const ELFIO::symbol_section_accessor& section, - const std::string& symbol_name) - { - using namespace ELFIO; - using namespace std; - - static constexpr pair r{0, 0}; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. 
- string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (name == symbol_name) return make_pair(value, size); - } - - return r; - } - - inline - void associate_code_object_symbols_with_host_allocation( - const ELFIO::elfio& reader, - const ELFIO::elfio& self_reader, - ELFIO::section* code_object_dynsym, - ELFIO::section* process_symtab, - hsa_agent_t agent, - hsa_executable_t executable) - { - using namespace ELFIO; - using namespace std; - - if (!code_object_dynsym || !process_symtab) return; - - const auto undefined_symbols = copy_names_of_undefined_symbols( - symbol_section_accessor{reader, code_object_dynsym}); - - for (auto&& x : undefined_symbols) { - const auto tmp = find_symbol_address( - symbol_section_accessor{self_reader, process_symtab}, x); - - assert(tmp.first); - - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - - static vector< - unique_ptr> globals; - static mutex mtx; - - lock_guard lck{mtx}; - globals.emplace_back(p, hsa_amd_memory_unlock); - } - } - - inline - void load_code_object_and_freeze_executable( - const char* file, hsa_agent_t agent, hsa_executable_t executable) - { // TODO: the following sequence is inefficient, should be refactored - // into a single load of the file and subsequent ELFIO - // processing. 
- using namespace std; - - static const auto cor_deleter = [](hsa_code_object_reader_t* p) { - hsa_code_object_reader_destroy(*p); - }; - - using RAII_code_reader = unique_ptr< - hsa_code_object_reader_t, decltype(cor_deleter)>; - - unique_ptr cobj{fopen(file, "r"), fclose}; - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get()); - - hsa_executable_load_agent_code_object( - executable, agent, *tmp, nullptr, nullptr); - - hsa_executable_freeze(executable, nullptr); - - static vector code_readers; - static mutex mtx; - - lock_guard lck{mtx}; - code_readers.push_back(move(tmp)); - } -} - hipError_t hipModuleLoad(hipModule_t *module, const char *fname) { - using namespace ELFIO; - HIP_INIT_API(module, fname); hipError_t ret = hipSuccess; *module = new ihipModule_t; @@ -336,36 +191,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname) nullptr, &(*module)->executable); - elfio reader; - if (!reader.load(fname)) { + std::ifstream file{fname}; + + if (!file.is_open()) { return ihipLogStatus(hipErrorFileNotFound); } - else { - // TODO: this may benefit from caching as well. - elfio self_reader; - self_reader.load("/proc/self/exe"); - - const auto symtab = - find_section_if(self_reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - const auto code_object_dynsym = - find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_DYNSYM; - }); - - associate_code_object_symbols_with_host_allocation( - reader, - self_reader, - code_object_dynsym, - symtab, - currentDevice->_hsaAgent, - (*module)->executable); - - load_code_object_and_freeze_executable( - fname, currentDevice->_hsaAgent, (*module)->executable); - } + (*module)->executable = hip_impl::load_executable( + (*module)->executable, currentDevice->_hsaAgent, file); + ret = (*module)->executable.handle ? 
hipSuccess : hipErrorUnknown; } return ihipLogStatus(ret); diff --git a/src/program_state.cpp b/src/program_state.cpp new file mode 100644 index 0000000000..be871a6e84 --- /dev/null +++ b/src/program_state.cpp @@ -0,0 +1,498 @@ +#include "../include/hip/hcc_detail/program_state.hpp" + +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include "hip_hcc_internal.h" +#include "trace_helper.h" + +#include "elfio/elfio.hpp" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ELFIO; +using namespace hip_impl; +using namespace std; + +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; + + template<> + struct hash { + size_t operator()(hsa_isa_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + +inline +constexpr +bool operator==(hsa_isa_t x, hsa_isa_t y) +{ + return x.handle == y.handle; +} + +namespace +{ + vector copy_names_of_undefined_symbols( + const symbol_section_accessor& section) + { + vector r; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (sect_idx == SHN_UNDEF && !name.empty()) { + r.push_back(std::move(name)); + } + } + + return r; + } + + pair find_symbol_address( + const symbol_section_accessor& section, + const string& symbol_name) + { + static constexpr pair r{0, 0}; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. 
+ string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (name == symbol_name) return make_pair(value, size); + } + + return r; + } + + void associate_code_object_symbols_with_host_allocation( + const elfio& reader, + const elfio& self_reader, + section* code_object_dynsym, + section* process_symtab, + hsa_agent_t agent, + hsa_executable_t executable) + { + if (!code_object_dynsym || !process_symtab) return; + + const auto undefined_symbols = copy_names_of_undefined_symbols( + symbol_section_accessor{reader, code_object_dynsym}); + + for (auto&& x : undefined_symbols) { + const auto tmp = find_symbol_address( + symbol_section_accessor{self_reader, process_symtab}, x); + + assert(tmp.first); + + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); + + hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + + static vector< + unique_ptr> globals; + static mutex mtx; + + lock_guard lck{mtx}; + globals.emplace_back(p, hsa_amd_memory_unlock); + } + } + + template + inline + section* find_section_if(elfio& reader, P p) + { + const auto it = find_if( + reader.sections.begin(), reader.sections.end(), std::move(p)); + + return it != reader.sections.end() ? 
*it : nullptr; + } + + vector code_object_blob_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + static constexpr const char kernel_section[] = ".kernel"; + + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{"Failed to load ELF file for current process."}; + } + + auto kernels = find_section_if(reader, [](const section* x) { + return x->get_name() == kernel_section; + }); + + vector r; + if (kernels) { + r.insert( + r.end(), + kernels->get_data(), + kernels->get_data() + kernels->get_size()); + } + + return r; + } + + const unordered_map>>& code_object_blobs() + { + static unordered_map>> r; + static once_flag f; + + call_once(f, []() { + static vector> blobs{ + code_object_blob_for_process()}; + + dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) { + elfio tmp; + if (tmp.load(i->dlpi_name)) { + const auto it = find_section_if(tmp, [](const section* x) { + return x->get_name() == ".kernel"; + }); + + if (it) blobs.emplace_back( + it->get_data(), it->get_data() + it->get_size()); + } + return 0; + }, nullptr); + + for (auto&& blob : blobs) { + Bundled_code_header tmp{blob}; + if (valid(tmp)) { + for (auto&& bundle : bundles(tmp)) { + r[triple_to_hsa_isa(bundle.triple)] + .push_back(bundle.blob); + } + } + } + }); + + return r; + } + + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively 
inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + cout << r.size() << endl; + return r; + } + + inline + hsa_agent_t agent(hsa_executable_symbol_t x) + { + hsa_agent_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); + + return r; + } + + inline + uint32_t group_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); + + return r; + } + + inline + uint64_t kernel_object(hsa_executable_symbol_t x) + { + uint64_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); + + return r; + } + + inline + string name(hsa_executable_symbol_t x) + { + uint32_t sz = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + string r(sz, '\0'); + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; + } + + inline + uint32_t private_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); + + return r; + } + + inline + hsa_symbol_kind_t type(hsa_executable_symbol_t x) + { + hsa_symbol_kind_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); + + return r; + } + + const unordered_map>& kernels() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto copy_kernels = []( + hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) { + if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s); + + return HSA_STATUS_SUCCESS; + }; + + for (auto&& agent_executables : executables()) { + for (auto&& executable : agent_executables.second) { + 
hsa_executable_iterate_agent_symbols( + executable, + agent_executables.first, + copy_kernels, + nullptr); + } + } + }); + + return r; + } + + void load_code_object_and_freeze_executable( + istream& file, hsa_agent_t agent, hsa_executable_t executable) + { // TODO: the following sequence is inefficient, should be refactored + // into a single load of the file and subsequent ELFIO + // processing. + static const auto cor_deleter = [](hsa_code_object_reader_t* p) { + hsa_code_object_reader_destroy(*p); + }; + + using RAII_code_reader = unique_ptr< + hsa_code_object_reader_t, decltype(cor_deleter)>; + + file.seekg(0); + + vector blob{ + istreambuf_iterator{file}, istreambuf_iterator{}}; + RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; + hsa_code_object_reader_create_from_memory( + blob.data(), blob.size(), tmp.get()); + + hsa_executable_load_agent_code_object( + executable, agent, *tmp, nullptr, nullptr); + + hsa_executable_freeze(executable, nullptr); + + static vector code_readers; + static mutex mtx; + + lock_guard lck{mtx}; + code_readers.push_back(move(tmp)); + } +} + +namespace hip_impl +{ + const unordered_map& function_names() + { + static constexpr const char self[] = "/proc/self/exe"; + + static unordered_map r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + symbol_section_accessor symbols{reader, symtab}; + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. 
+ string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + symbols.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { + r.emplace(value, name); + } + } + }); + + return r; + } + + const unordered_map< + uintptr_t, vector>>& functions() + { + static unordered_map< + uintptr_t, vector>> r; + static once_flag f; + + call_once(f, []() { + for (auto&& function : function_names()) { + const auto it = kernels().find(function.second); + + if (it != kernels().cend()) { + for (auto&& kernel_symbol : it->second) { + r[function.first].emplace_back( + agent(kernel_symbol), + Kernel_descriptor{ + kernel_object(kernel_symbol), + group_size(kernel_symbol), + private_size(kernel_symbol), + it->first}); + } + } + } + }); + + return r; + } + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, istream& file) + { + elfio reader; + if (!reader.load(file)) { + return hsa_executable_t{}; + } + else { + // TODO: this may benefit from caching as well. + elfio self_reader; + self_reader.load("/proc/self/exe"); + + const auto symtab = + find_section_if(self_reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + const auto code_object_dynsym = + find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_DYNSYM; + }); + + associate_code_object_symbols_with_host_allocation( + reader, self_reader, code_object_dynsym, symtab, agent, executable); + + load_code_object_and_freeze_executable(file, agent, executable); + + return executable; + } + } +} // Namespace hip_impl. 
\ No newline at end of file diff --git a/tests/src/context/hipMemsetD8.cpp b/tests/src/context/hipMemsetD8.cpp index 3730fcb70b..a356d05b76 100644 --- a/tests/src/context/hipMemsetD8.cpp +++ b/tests/src/context/hipMemsetD8.cpp @@ -46,7 +46,6 @@ int main(int argc, char *argv[]) A_h = new char[Nbytes]; HIPCHECK ( hipMalloc((void **) &A_d, Nbytes) ); - A_h = (char*)malloc(Nbytes); printf ("Size=%zu memsetval=%2x \n", Nbytes, memsetval); HIPCHECK ( hipMemsetD8(A_d, memsetval, Nbytes) ); @@ -61,7 +60,7 @@ int main(int argc, char *argv[]) } hipFree((void *) A_d); - free(A_h); + delete [] A_h; passed(); } diff --git a/tests/src/deviceLib/hipTestDevice.cpp b/tests/src/deviceLib/hipTestDevice.cpp index 570f3baaf0..fa85940839 100644 --- a/tests/src/deviceLib/hipTestDevice.cpp +++ b/tests/src/deviceLib/hipTestDevice.cpp @@ -139,7 +139,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -174,7 +181,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -205,7 +219,13 @@ for(int i=0;i<512;i++){ } } -free(A); +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -234,7 +254,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -263,7 +288,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -291,7 +321,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -321,7 +356,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + 
if(passed == 1){ return true; } @@ -350,7 +390,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -387,7 +432,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -427,7 +481,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -457,7 +522,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -489,7 +559,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -525,7 +602,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -565,7 +651,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -595,7 +692,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -622,7 +724,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -631,7 +738,7 @@ return false; } int main(){ -if(run_sincosf() && run_sincospif() && run_fdividef() && +if(run_sincosf() && run_sincospif() && run_fdividef() && run_llrintf() && 
run_norm3df() && run_norm4df() && run_normf() && run_rnorm3df() && run_rnorm4df() && run_rnormf() && run_lroundf() && run_llroundf() && diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp index 5bdbbf1b8f..3b919d0cab 100644 --- a/tests/src/deviceLib/hipTestDeviceDouble.cpp +++ b/tests/src/deviceLib/hipTestDeviceDouble.cpp @@ -128,7 +128,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -163,7 +170,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -193,7 +207,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -221,7 +240,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -249,7 +273,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -278,7 +307,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -306,7 +340,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -343,7 +382,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -383,7 +431,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); 
+hipFree(Ed); + if(passed == 1){ return true; } @@ -416,7 +475,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -452,7 +518,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -492,7 +567,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -522,7 +608,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -549,7 +640,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index 171ff1afd0..5540c4917d 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -159,11 +159,16 @@ bool dataTypesRun(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -221,11 +226,16 @@ bool dataTypesRun2(){ HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMemcpy(deviceB, hostB, 
NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -281,11 +291,16 @@ bool dataTypesRun4(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp index 79a272aaf2..d1bbed63cd 100644 --- a/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/tests/src/experimental/xcompile/hipxxKer.cpp @@ -36,17 +36,23 @@ __global__ void Kern(hipLaunchParm lp, float *A) int main() { - float *A, *Ad; + float A[len]; + float *Ad; + for(int i=0;i(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK (hipEventRecord(stop, NULL)); diff --git a/tests/src/runtimeApi/event/record_event.cpp b/tests/src/runtimeApi/event/record_event.cpp index bd8a3ada8e..a7b99749cb 100644 --- a/tests/src/runtimeApi/event/record_event.cpp +++ b/tests/src/runtimeApi/event/record_event.cpp @@ -52,7 +52,7 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%3x: 
stream=%p waitStart=%d syncMode=%s\n", + printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", testMask, stream, waitStart, syncModeString(syncMode)); size_t sizeBytes = numElements * sizeof(int); @@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + stream, + static_cast(C_d), + C_h, + numElements, + count); HIPCHECK(hipEventRecord(stop, stream)); @@ -85,8 +94,8 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventSynchronize(start)); } - - hipError_t expectedStopError = hipSuccess; + + hipError_t expectedStopError = hipSuccess; // How to wait for the events to finish: switch (syncMode) { @@ -97,12 +106,12 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... 
break; case syncStopEvent: - HIPCHECK(hipEventSynchronize(stop)); + HIPCHECK(hipEventSynchronize(stop)); break; default: assert(0); }; - + float t; @@ -111,25 +120,25 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e)); } - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); - + // stop usually ready unless we skipped the synchronization (syncNone) HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError); - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); e = hipEventElapsedTime(&t, start, stop); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t>0.0f); printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); e = hipEventElapsedTime(&t, stop, start); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t<0.0f); printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index e8e803e44c..b3f25658fc 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -58,7 +58,7 @@ public: void offset(int offset) { _offset = offset; }; int offset() const { return _offset; }; - + private: T * _A_d; T* _B_d; @@ -72,7 +72,7 @@ private: template DeviceMemory::DeviceMemory(size_t numElements) - : _maxNumElements(numElements), + : _maxNumElements(numElements), _offset(0) { T ** np = nullptr; @@ -93,7 +93,7 @@ DeviceMemory::~DeviceMemory () HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); HIPCHECK (hipFree(_C_dd)); - + _C_dd = NULL; }; @@ -125,7 +125,7 @@ public: T * A_hh; T* B_hh; - bool _usePinnedHost; + bool _usePinnedHost; private: size_t _maxNumElements; @@ -165,11 +165,11 @@ HostMemory::HostMemory(size_t numElements, bool usePinnedHost) 
template void -HostMemory::reset(size_t numElements, bool full) +HostMemory::reset(size_t numElements, bool full) { // Initialize the host data: for (size_t i=0; i void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", - __func__, + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", + __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, @@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(dmem->A_d()), + static_cast(dmem->B_d()), + dmem->C_d(), + numElements); if (useDeviceToDevice) { // Do an extra device-to-device copy here to mix things up: @@ -273,8 +282,8 @@ void memcpytest2_for_type(size_t numElements) { printSep(); - DeviceMemory memD(numElements); - HostMemory memU(numElements, 0/*usePinnedHost*/); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); HostMemory memP(numElements, 1/*usePinnedHost*/); for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { @@ -307,11 +316,11 @@ void memcpytest2_sizes(size_t maxElem=0) maxElem = free/sizeof(T)/8; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf 
(" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); for (size_t elem=1; elem<=maxElem; elem*=2) { @@ -336,11 +345,11 @@ void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) HIPCHECK(hipMemGetInfo(&free, &total)); - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); size_t elem = maxElem / 2; @@ -380,16 +389,16 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - DeviceMemory memD(N); - HostMemory mem1(N, usePinnedHost); - HostMemory mem2(N, usePinnedHost); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); @@ -427,21 +436,21 @@ int main(int argc, char *argv[]) // Some tests around the 64KB boundary which have historically shown issues: printf ("\n\n=== tests&0x2 (64KB boundary)\n"); size_t maxElem = 32*1024*1024; - DeviceMemory memD(maxElem); - 
HostMemory memU(maxElem, 0/*usePinnedHost*/); - HostMemory memP(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); // Just over 64MB: - memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); } @@ -464,7 +473,7 @@ int main(int argc, char *argv[]) // Simplest cases: serialize the threads, and also used pinned memory: // This verifies that the sub-calls to memcpytest2 are correct. - multiThread_1(true, true); + multiThread_1(true, true); // Serialize, but use unpinned memory to stress the unpinned memory xfer path. 
multiThread_1(true, false); diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 316f50c01b..9a09e7e95c 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -63,7 +63,16 @@ void simpleTest1() HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 4f73b67ad7..9d274543ab 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -41,8 +41,8 @@ void printSep() // Designed to stress a small number of simple smoke tests template< - typename T=float, - class P=HipTest::Unpinned, + typename T=float, + class P=HipTest::Unpinned, class C=HipTest::Memcpy > void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) @@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) // This is the null stream? 
//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); - hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel( + HipTest::vectorADDReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp index b610315608..04a232f3bb 100644 --- a/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -119,7 +119,7 @@ void Streamer::reset() { HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); H2D(); - + } @@ -128,7 +128,17 @@ void Streamer::enqueAsync() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + static_cast(_B_d), + _C_d, + _numElements, + p_repeat); } @@ -225,7 +235,17 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); if (p_db) { @@ -238,7 +258,7 
@@ int main(int argc, char *argv[]) nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -257,13 +277,23 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -289,10 +319,10 @@ int main(int argc, char *argv[]) // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. 
HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); - + HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); } diff --git a/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/tests/src/runtimeApi/stream/hipStreamSync2.cpp index c6a58ce7d4..962737774d 100644 --- a/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -59,23 +59,23 @@ const char *syncModeString(int syncMode) { void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - // This test sends a long-running kernel to the null stream, then tests to see if the + // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // - // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it - // may complete before the check. To prevent this, the addCountReverse has a count parameter - // which causes it to loop repeatedly, and the results are checked in reverse order. + // may complete before the check. To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. 
- + if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -97,8 +97,17 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); - HIPCHECK(hipEventRecord(stop, 0/*default*/)); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0 /*stream*/, + static_cast(C_d), + C_h, + numElements, + count); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -108,18 +117,18 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s break; case syncOtherStream: // Does this synchronize with the null stream? 
- HIPCHECK(hipStreamSynchronize(otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - - // this may wait for NULL stream depending hipStreamNonBlocking flag above - HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); - HIPCHECK(hipStreamSynchronize(otherStream)); + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: - HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); @@ -197,7 +206,7 @@ void runTests(int64_t numElements) int main(int argc, char *argv[]) { // Can' destroy the default stream:// TODO - move to another test - HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index f5b1b79550..a7a930b4f6 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -88,7 +88,7 @@ private: template Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : - _preA_d(NULL), + _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), @@ -163,9 +163,27 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_commandType == COMMAND_ADD_REVERSE) { - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + 
_C_d, + static_cast(_numElements), + static_cast(p_count)); } else if (_commandType == COMMAND_ADD_FORWARD) { - hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCount, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + _numElements, + static_cast(p_count)); } else if (_commandType == COMMAND_COPY) { HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } else { @@ -239,7 +257,7 @@ size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool e return _mismatchCount; } - + //--- //Parse arguments specific to this test. @@ -300,7 +318,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< for (int i=0; iexpectedAdd(); - + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } @@ -330,7 +348,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< void sync_none(void) {}; -void sync_allDevices(int numDevices) +void sync_allDevices(int numDevices) { for (int d=0; d streamers) +void sync_queryAllUntilComplete(std::vector streamers) { for (int i=streamers.size()-1; i>=0; i--) { streamers[i]->queryUntilComplete(); @@ -347,7 +365,7 @@ void sync_queryAllUntilComplete(std::vector streamers) } -void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) { HIPCHECK(hipSetDevice(sideDeviceId)); @@ -389,7 +407,7 @@ int main(int argc, char *argv[]) initArray_h[i] = initValue; } HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); - + int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); @@ -414,7 +432,7 @@ int main(int argc, char *argv[]) // A sideband stream channel that is independent from above. 
- // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. std::vector sideStreams; for (int d=0; d Test 0x1000 simple null stream tests\n"); + printf ("==> Test 0x1000 simple null stream tests\n"); // try some null stream: hipStreamQuery(0); @@ -463,7 +481,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, s1)) HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } @@ -476,11 +494,11 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, hipStream_t(0))) HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } - + } diff --git a/tests/src/stress/hipStressKernel.cpp b/tests/src/stress/hipStressKernel.cpp index 7b5eec5a80..52d8fa1fe9 100644 --- a/tests/src/stress/hipStressKernel.cpp +++ b/tests/src/stress/hipStressKernel.cpp @@ -57,5 +57,8 @@ int main(){ } std::cout< Date: Wed, 1 Nov 2017 22:33:13 +0000 Subject: [PATCH 02/35] Correctly deal with functions from shared objects, wherein the program visible VA == so_base_va + st_value(function_symbol). Remove quaint usage of pfe for hipMemset (which is actually fill_n). --- src/hip_memory.cpp | 133 +++++++++++++++++++----------------------- src/program_state.cpp | 108 ++++++++++++++++++++++++---------- 2 files changed, 136 insertions(+), 105 deletions(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 96fc25c27d..32e0016178 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1153,42 +1153,56 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) return ihipLogStatus(e); } -// TODO - make member function of stream? 
+namespace +{ + template< + uint32_t block_dim, + typename RandomAccessIterator, + typename N, + typename T> + __global__ + void hip_fill_n(RandomAccessIterator f, N n, T value) + { + const uint32_t grid_dim = hipGridDim_x; + + size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x; + while (idx < n) { + new (&f[idx]) T{value}; + idx += grid_dim; + } + } + + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + const T& clamp_integer(const T& x, const T& lower, const T& upper) + { + assert(!(upper < lower)); + + return std::min(upper, std::max(x, lower)); + } +} + template void ihipMemsetKernel(hipStream_t stream, - LockedAccessor_StreamCrit_t &crit, - T * ptr, T val, size_t sizeBytes, - hc::completion_future *cf) + T * ptr, T val, size_t sizeBytes) { - int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits); - const int threads_per_wg = 256; + static constexpr uint32_t block_dim = 256; - int threads = wg * threads_per_wg; - if (threads > sizeBytes) { - threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg; - } - - - hc::extent<1> ext(threads); - auto ext_tile = ext.tile(threads_per_wg); - - *cf = - hc::parallel_for_each( - crit->_av, - ext_tile, - [=] (hc::tiled_index<1> idx) - __attribute__((hc)) - { - int offset = amp_get_global_id(0); - // TODO-HCC - change to hc_get_local_size() - int stride = amp_get_local_size(0) * hc_get_num_groups(0) ; - - for (int i=offset; i( + sizeBytes / block_dim, 1, UINT32_MAX); + hipLaunchKernelGGL( + hip_fill_n, + dim3(grid_dim), + dim3{block_dim}, + 0u, + stream, + ptr, + sizeBytes, + std::move(val)); } @@ -1202,17 +1216,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = 
(value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1220,19 +1229,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av); - - if (HIP_API_BLOCKING) { tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); } } else { e = hipErrorInvalidValue; @@ -1253,16 +1259,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1270,21 +1272,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? 
- cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1305,17 +1304,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - size_t sizeBytes = pitch * height; if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1323,20 +1318,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? 
- cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1357,36 +1350,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { + std::cout << ex.what() << std::endl; e = hipErrorInvalidValue; } } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { diff --git a/src/program_state.cpp b/src/program_state.cpp index be871a6e84..a4f7fdbdbe 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -195,9 +195,9 @@ namespace static vector> blobs{ code_object_blob_for_process()}; - dl_iterate_phdr([](dl_phdr_info* i, 
std::size_t, void*) { + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) { elfio tmp; - if (tmp.load(i->dlpi_name)) { + if (tmp.load(info->dlpi_name)) { const auto it = find_section_if(tmp, [](const section* x) { return x->get_name() == ".kernel"; }); @@ -269,6 +269,61 @@ namespace return r; } + vector> function_names_for( + const elfio& reader, section* symtab) + { + vector> r; + symbol_section_accessor symbols{reader, symtab}; + + auto foo = reader.get_entry(); + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + symbols.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { + r.emplace_back(value, name); + } + } + + return r; + } + + const vector>& function_names_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + + static vector> r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + r = function_names_for(reader, symtab); + }); + + return r; + } + inline hsa_agent_t agent(hsa_executable_symbol_t x) { @@ -395,43 +450,32 @@ namespace hip_impl { const unordered_map& function_names() { - static constexpr const char self[] = "/proc/self/exe"; - - static unordered_map r; + static unordered_map r{ + function_names_for_process().cbegin(), + function_names_for_process().cend()}; static once_flag f; call_once(f, []() { - elfio reader; + dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { + elfio tmp; + if (tmp.load(info->dlpi_name)) { + const auto it = find_section_if(tmp, [](const 
section* x) { + return x->get_type() == SHT_SYMTAB; + }); - if (!reader.load(self)) { - throw runtime_error{ - "Failed to load the ELF file for the current process."}; - } + if (it) { + auto n = function_names_for(tmp, it); - auto symtab = find_section_if(reader, [](const section* x) { - return x->get_type() == SHT_SYMTAB; - }); + for (auto&& f : n) f.first += info->dlpi_addr; - symbol_section_accessor symbols{reader, symtab}; - - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - symbols.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { - r.emplace(value, name); + r.insert( + make_move_iterator(n.begin()), + make_move_iterator(n.end())); + } } - } + + return 0; + }, nullptr); }); return r; From bd784596905881fa23e1a9e98bea72675c863124 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 10 Nov 2017 00:14:34 +0000 Subject: [PATCH 03/35] Update new tests so as to make them work with new variadic based launch mechanisms. 
--- tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp | 34 ++++++++++++++----- .../runtimeApi/memory/hipMemcpyDtoDAsync.cpp | 34 ++++++++++++++----- tests/src/runtimeApi/memory/hipMemcpyPeer.cpp | 30 ++++++++++++---- .../runtimeApi/memory/hipMemcpyPeerAsync.cpp | 32 +++++++++++++---- 4 files changed, 101 insertions(+), 29 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp index ccb02b74ce..c64b01f8a7 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp @@ -49,21 +49,39 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - + + HIPCHECK(hipSetDevice(1)); - HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes)); + HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes)); HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); @@ -73,8 +91,8 @@ int main() HIPCHECK(hipFree(Y_d)); HIPCHECK(hipFree(Z_d)); } - + passed(); - + } diff --git a/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp index 5c99b43564..6d21ac62e7 
100644 --- a/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp @@ -50,25 +50,43 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HIPCHECK(hipStreamCreate(&s)); + + HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); - HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s)); + HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s)); HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s)); HIPCHECK(hipStreamSynchronize(s)); HIPCHECK(hipDeviceSynchronize()); - + HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); @@ -78,7 +96,7 @@ int main() } passed(); - + } diff --git a/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp b/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp index 7e2fc2d3d0..95b19c1090 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp @@ -48,24 +48,42 @@ int main() HIPCHECK(hipMalloc(&X_d,Nbytes)); HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, 
Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HIPCHECK(hipSetDevice(1)); hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs. hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); HIPCHECK(hipFree(Y_d)); @@ -74,7 +92,7 @@ int main() passed(); - + } diff --git a/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp index 9d46ccb0d8..943e4a6b95 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp @@ -51,26 +51,44 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( hipMemcpy(C_h, C_d, 
Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HIPCHECK(hipStreamCreate(&s)); + + HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s)); HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HIPCHECK (hipStreamSynchronize(s)); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); @@ -79,7 +97,7 @@ int main() } passed(); - + } From 819e72fba62bd7b79e3ee79e2f3b47c31ce1961d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 10 Nov 2017 01:20:50 +0000 Subject: [PATCH 04/35] Add omitted changes in CMakeLists.txt. 
--- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c8f640afb..e405d06ed6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,6 +167,7 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") set(SOURCE_FILES_RUNTIME + src/code_object_bundle.cpp src/hip_hcc.cpp src/hip_context.cpp src/hip_device.cpp @@ -179,7 +180,8 @@ if(HIP_PLATFORM STREQUAL "hcc") src/hip_db.cpp src/grid_launch.cpp src/hip_texture.cpp - src/env.cpp) + src/env.cpp + src/program_state.cpp) set(SOURCE_FILES_DEVICE src/device_util.cpp From 153878e368d4e2966bd5c2524e1fd8c3e1b686ff Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 17 Nov 2017 16:00:28 +0000 Subject: [PATCH 05/35] This fixes some outright quaint choices made when implementing HIP's bitwise conversion functions, by using simple reinterpret_casts, as is idiomatic. These functions are supposed to be re-entrant, correct and efficient. Sadly, they were neither: they hid a massive race condition against a value stored in global memory, which means that they were also unreasonably slow if they ever managed to be correct, and relied on union based type punning which is in a grey area of the standard. It is difficult to ascertain what may have been the reason for coming up with this quirky solution. --- src/device_functions.cpp | 53 +++++++++------------------------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index 615ae4d0b7..a66cc1e9fb 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -23,27 +23,6 @@ THE SOFTWARE. 
#include #include "device_util.h" -struct holder64Bit{ - union{ - double d; - unsigned long int uli; - signed long int sli; - signed int si[2]; - unsigned int ui[2]; - }; -} __attribute__((aligned(8))); - -struct holder32Bit { - union { - float f; - unsigned int ui; - signed int si; - }; -} __attribute__((aligned(4))); - -__device__ struct holder64Bit hold64; -__device__ struct holder32Bit hold32; - __device__ float __double2float_rd(double x) { return (double)x; @@ -64,13 +43,11 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - hold64.d = x; - return hold64.si[1]; + return reinterpret_cast(x)[1]; } __device__ int __double2loint(double x) { - hold64.d = x; - return hold64.si[0]; + return reinterpret_cast(x)[0]; } @@ -145,8 +122,7 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - hold64.d = x; - return hold64.sli; + return reinterpret_cast(x); } __device__ int __float2int_rd(float x) @@ -219,19 +195,17 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - hold32.f = x; - return hold32.si; + return reinterpret_cast(x); } __device__ unsigned int __float_as_uint(float x) { - hold32.f = x; - return hold32.ui; + return reinterpret_cast(x); } __device__ double __hiloint2double(int hi, int lo) -{ - hold64.si[1] = hi; - hold64.si[0] = lo; - return hold64.d; +{ // TODO: this matches the original in not considering endianness, is that + // correct though? 
+ int tmp[] = {lo, hi}; + return reinterpret_cast(tmp); } __device__ double __int2double_rn(int x) { @@ -257,8 +231,7 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - hold32.si = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ll2double_rd(long long int x) @@ -297,8 +270,7 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - hold64.sli = x; - return hold64.d; + return reinterpret_cast(x); } __device__ double __uint2double_rn(int x) @@ -325,8 +297,7 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - hold32.ui = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ull2double_rd(unsigned long long int x) From 6fa7adf077870ccc40aaf606a941d6fc938207cf Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 18 Nov 2017 01:16:31 +0000 Subject: [PATCH 06/35] This actually (tries) to do the right thing all the way, by using memcpy for bitcasting, rather than relying on undefined behaviour of a different flavour as a substitute for the original undefined behaviour. Note that the compiler will (should) optimise down to the same emitted code, since this is a pattern it understands. 
--- src/device_functions.cpp | 71 ++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index a66cc1e9fb..63425bc9f4 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -43,11 +43,21 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - return reinterpret_cast(x)[1]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[1]; } __device__ int __double2loint(double x) { - return reinterpret_cast(x)[0]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[0]; } @@ -122,7 +132,12 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - return reinterpret_cast(x); + static_assert(sizeof(long long) == sizeof(double), ""); + + long long tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ int __float2int_rd(float x) @@ -195,17 +210,32 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - return reinterpret_cast(x); + static_assert(sizeof(int) == sizeof(float), ""); + + int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ unsigned int __float_as_uint(float x) { - return reinterpret_cast(x); + static_assert(sizeof(unsigned int) == sizeof(float), ""); + + unsigned int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } -__device__ double __hiloint2double(int hi, int lo) -{ // TODO: this matches the original in not considering endianness, is that - // correct though? 
- int tmp[] = {lo, hi}; - return reinterpret_cast(tmp); +__device__ double __hiloint2double(int32_t hi, int32_t lo) +{ + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + uint64_t tmp0 = + (static_cast(hi) << 32ull) | static_cast(lo); + double tmp1; + __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + + return tmp1; } __device__ double __int2double_rn(int x) { @@ -231,7 +261,12 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ll2double_rd(long long int x) @@ -270,7 +305,12 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - return reinterpret_cast(x); + static_assert(sizeof(double) == sizeof(long long), ""); + + double tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return x; } __device__ double __uint2double_rn(int x) @@ -297,7 +337,12 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(unsigned int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ull2double_rd(unsigned long long int x) From 7d5a45ac1a953f3a7958fbc138ec792834fcd64d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 20 Nov 2017 16:33:52 +0000 Subject: [PATCH 07/35] Correct ill-formed merge in earlier commit and adjust for differences with the new CUDA natural indexing mechanism. 
--- include/hip/hcc_detail/hip_runtime.h | 15 ++++++++++++--- src/grid_launch.cpp | 17 ----------------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 924e774af0..c2302d4dc1 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -389,9 +389,18 @@ templatelockopen_preKernelCommand()); - - stream->lockclose_postKernelCommand(kernel_name, acc_v); - - delete static_cast(locked_stream); - locked_stream = nullptr; - if(HIP_PROFILE_API) { - MARKER_END(); ->>>>>>> e8ede28ec4f5744185b171031e537237afb7affa } } } From 1824fb76985d1a4b172914501617fe34bc12df2d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 20 Nov 2017 22:41:46 +0000 Subject: [PATCH 08/35] Clean-up some remaining noise in program_state.cpp. --- src/program_state.cpp | 45 ++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/program_state.cpp b/src/program_state.cpp index a4f7fdbdbe..d5e2f80a05 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -17,9 +17,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -132,21 +134,39 @@ namespace const auto tmp = find_symbol_address( symbol_section_accessor{self_reader, process_symtab}, x); - assert(tmp.first); + if (!tmp.first) { + throw runtime_error{ + "The global variable: " + x + ", could not be found."}; + } - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); + static unordered_map< + Elf64_Addr, + unique_ptr> globals; + + if (globals.count(tmp.first) == 0) { + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(tmp.first), + tmp.second, + &agent, + 1, + &p); + + static mutex mtx; + + lock_guard lck{mtx}; + globals.emplace( + piecewise_construct, + make_tuple(tmp.first), + make_tuple(p, hsa_amd_memory_unlock)); + } + + const auto it = 
globals.find(tmp.first); + + assert(it != globals.cend()); hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - - static vector< - unique_ptr> globals; - static mutex mtx; - - lock_guard lck{mtx}; - globals.emplace_back(p, hsa_amd_memory_unlock); + executable, agent, x.c_str(), it->second.get()); } } @@ -265,7 +285,6 @@ namespace } }); - cout << r.size() << endl; return r; } From 9d088d22836e663df60ce66e08d3fdd28cf636fc Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 21 Nov 2017 02:40:34 +0000 Subject: [PATCH 09/35] Refactor the __device__ versions of memset and memcpy to be less awkward i.e. not return nullptr as opposed to the destination pointer (it can only be assumed it was done for maximum confusion) and actually unroll as they claim to. Change all of the {to, from}Symbol functions to use hipModuleGetGlobal, as opposed to hc::accelerator::get_symbol_address which is no longer valid with module based dispatch. --- include/hip/hcc_detail/program_state.hpp | 20 ++++ src/device_util.cpp | 49 +++++++--- src/hip_memory.cpp | 24 +++-- src/hip_module.cpp | 119 ++++++++++++++++------- src/program_state.cpp | 107 +++++++++----------- 5 files changed, 206 insertions(+), 113 deletions(-) diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp index 03701725eb..0e21b12f5f 100644 --- a/include/hip/hcc_detail/program_state.hpp +++ b/include/hip/hcc_detail/program_state.hpp @@ -35,6 +35,24 @@ THE SOFTWARE. 
struct ihipModuleSymbol_t; using hipFunction_t = ihipModuleSymbol_t*; +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + namespace hip_impl { struct Kernel_descriptor { @@ -50,6 +68,8 @@ namespace hip_impl } }; + const std::unordered_map< + hsa_agent_t, std::vector>& executables(); const std::unordered_map< std::uintptr_t, std::vector>>& functions(); diff --git a/src/device_util.cpp b/src/device_util.cpp index 367a4c1a4f..b6aebdfce0 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr) // loop unrolling __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) { - uint8_t *dstPtr, *srcPtr; - dstPtr = (uint8_t*)dst; - srcPtr = (uint8_t*)src; - for(uint32_t i=0;i(dst); + auto srcPtr = static_cast(src); + + while (size >= 4u) { + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + + size -= 4u; + srcPtr += 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = srcPtr[2]; + case 2: dstPtr[1] = srcPtr[1]; + case 1: dstPtr[0] = srcPtr[0]; + } + + return dst; } -__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size) +__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) { - uint8_t *dstPtr; - dstPtr = (uint8_t*)ptr; - for(uint32_t i=0;i(dst); + + while (size >= 4u) { + dstPtr[0] = val; + dstPtr[1] = val; + dstPtr[2] = val; + dstPtr[3] = val; + + size -= 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = val; + case 2: dstPtr[1] = val; + case 1: dstPtr[0] = val; + } + + return dst; } __device__ float __hip_erfinvf(float x){ diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 047cf76c08..04ea38fcd5 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -715,7 
+715,10 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -750,7 +753,10 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -787,7 +793,10 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -825,7 +834,10 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src); if(src == nullptr || dst == nullptr) @@ -1171,9 +1183,9 @@ namespace __global__ void hip_fill_n(RandomAccessIterator f, N n, T value) { - const uint32_t grid_dim = hipGridDim_x; + const uint32_t grid_dim = gridDim.x * blockDim.x; - size_t idx = 
hipBlockIdx_x * block_dim + hipThreadIdx_x; + size_t idx = blockIdx.x * block_dim + threadIdx.x; while (idx < n) { new (&f[idx]) T{value}; idx += grid_dim; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index df847f9f64..fb1cf29df8 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -554,16 +554,93 @@ namespace } inline - std::vector read_agent_globals(hipModule_t hmodule) + std::vector read_agent_globals( + hsa_agent_t agent, hsa_executable_t executable) { std::vector r; - hsa_executable_iterate_agent_symbols( - hmodule->executable, this_agent(), copy_agent_global_variables, &r); + executable, agent, copy_agent_global_variables, &r); return r; } + + template + std::pair read_global_description( + ForwardIterator f, ForwardIterator l, const char* name) + { + const auto it = std::find_if( + f, l, [=](const Agent_global& x) { return x.name == name; }); + + return it == l ? + std::make_pair(nullptr, 0u) : + std::make_pair(it->address, it->byte_cnt); + } + + hipError_t read_agent_global_from_module( + hipDeviceptr_t *dptr, + size_t* bytes, + hipModule_t hmod, + const char* name) + { + static std::unordered_map< + hipModule_t, std::vector> agent_globals; + + // TODO: this is not particularly robust. + if (agent_globals.count(hmod) == 0) { + static std::mutex mtx; + std::lock_guard lck{mtx}; + + if (agent_globals.count(hmod) == 0) { + agent_globals.emplace( + hmod, read_agent_globals(this_agent(), hmod->executable)); + } + } + + // TODO: This is unsafe iff some other emplacement triggers rehashing. + // It will have to be properly fleshed out in the future. + const auto it0 = agent_globals.find(hmod); + if (it0 == agent_globals.cend()) { + throw std::runtime_error{"agent_globals data structure corrupted."}; + } + + std::tie(*dptr, *bytes) = read_global_description( + it0->second.cbegin(), it0->second.cend(), name); + + return dptr ? 
hipSuccess : hipErrorNotFound; + } + + hipError_t read_agent_global_from_process( + hipDeviceptr_t *dptr, size_t* bytes, const char* name) + { + static std::unordered_map< + hsa_agent_t, std::vector> agent_globals; + static std::once_flag f; + + std::call_once(f, []() { + for (auto&& agent_executables : hip_impl::executables()) { + std::vector tmp0; + for (auto&& executable : agent_executables.second) { + auto tmp1 = read_agent_globals( + agent_executables.first, executable); + tmp0.insert( + tmp0.end(), + std::make_move_iterator(tmp1.begin()), + std::make_move_iterator(tmp1.end())); + } + agent_globals.emplace(agent_executables.first, std::move(tmp0)); + } + }); + + const auto it = agent_globals.find(this_agent()); + + if (it == agent_globals.cend()) return hipErrorNotInitialized; + + std::tie(*dptr, *bytes) = read_global_description( + it->second.cbegin(), it->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, @@ -574,41 +651,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, if(dptr == NULL || bytes == NULL){ return ihipLogStatus(hipErrorInvalidValue); } - if(name == NULL || hmod == NULL){ + if(name == NULL){ return ihipLogStatus(hipErrorNotInitialized); } else{ - static std::unordered_map< - hipModule_t, std::vector> agent_globals; + ret = hmod ? + read_agent_global_from_module(dptr, bytes, hmod, name) : + read_agent_global_from_process(dptr, bytes, name); - // TODO: this is not particularly robust. - if (agent_globals.count(hmod) == 0) { - static std::mutex mtx; - std::lock_guard lck{mtx}; - - if (agent_globals.count(hmod) == 0) { - agent_globals.emplace(hmod, read_agent_globals(hmod)); - } - } - - // TODO: This is unsafe iff some other emplacement triggers rehashing. - // It will have to be properly fleshed out in the future. 
- const auto it0 = agent_globals.find(hmod); - if (it0 == agent_globals.cend()) { - throw std::runtime_error{"agent_globals data structure corrupted."}; - } - - const auto it1 = std::find_if( - it0->second.cbegin(), - it0->second.cend(), - [=](const Agent_global& x) { return x.name == name; }); - - if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound); - - *dptr = it1->address; - *bytes = it1->byte_cnt; - - return ihipLogStatus(hipSuccess); + return ihipLogStatus(ret); } } diff --git a/src/program_state.cpp b/src/program_state.cpp index d5e2f80a05..2bb115981b 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -31,14 +31,6 @@ using namespace std; namespace std { - template<> - struct hash { - size_t operator()(hsa_agent_t x) const - { - return hash{}(x.handle); - } - }; - template<> struct hash { size_t operator()(hsa_isa_t x) const @@ -48,13 +40,6 @@ namespace std }; } -inline -constexpr -bool operator==(hsa_agent_t x, hsa_agent_t y) -{ - return x.handle == y.handle; -} - inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) @@ -242,52 +227,6 @@ namespace return r; } - const unordered_map>& executables() - { - static unordered_map> r; - static once_flag f; - - call_once(f, []() { - static const auto accelerators = hc::accelerator::get_all(); - - for (auto&& acc : accelerators) { - auto agent = static_cast(acc.get_hsa_agent()); - - if (!agent) continue; - - hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { - const auto it = code_object_blobs().find(x); - - if (it != code_object_blobs().cend()) { - hsa_agent_t a = *static_cast(pa); - - for (auto&& blob : it->second) { - hsa_executable_t tmp = {}; - - hsa_executable_create_alt( - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, - &tmp); - - // TODO: this is massively inefficient and only - // meant for illustration. 
- string blob_to_str{blob.cbegin(), blob.cend()}; - stringstream istr{blob_to_str}; - tmp = load_executable(tmp, a, istr); - - if (tmp.handle) r[a].push_back(tmp); - } - } - - return HSA_STATUS_SUCCESS; - }, agent); - } - }); - - return r; - } - vector> function_names_for( const elfio& reader, section* symtab) { @@ -467,6 +406,52 @@ namespace namespace hip_impl { + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + return r; + } + const unordered_map& function_names() { static unordered_map r{ From 5e16ee0d1f6ad9457790de20ea58b007c130f086 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 21 Nov 2017 13:15:13 +0000 Subject: [PATCH 10/35] This corrects how addresses are formed for symbols which reside in shared objects. For this case, the .value component of an ELF symbol holds the offset from the base VA where the shared object was loaded. Thus, to correctly obtain the VA of the object referred to by the symbol, we must add the offset to the VA where the shared object is loaded. 
We were already doing this correctly for symbols denoting functions, but we were incorrect for those denoting objects. --- src/program_state.cpp | 191 +++++++++++++++++++++++------------------- 1 file changed, 106 insertions(+), 85 deletions(-) diff --git a/src/program_state.cpp b/src/program_state.cpp index 2bb115981b..79d692e06f 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -49,6 +49,38 @@ bool operator==(hsa_isa_t x, hsa_isa_t y) namespace { + struct Symbol { + std::string name; + ELFIO::Elf64_Addr value = 0; + ELFIO::Elf_Xword size = 0; + ELFIO::Elf_Half sect_idx = 0; + std::uint8_t bind = 0; + std::uint8_t type = 0; + std::uint8_t other = 0; + }; + + inline + Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx) + { + assert(idx < section.get_symbols_num()); + + Symbol r; + section.get_symbol( + idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); + + return r; + } + + template + inline + section* find_section_if(elfio& reader, P p) + { + const auto it = find_if( + reader.sections.begin(), reader.sections.end(), move(p)); + + return it != reader.sections.end() ? *it : nullptr; + } + vector copy_names_of_undefined_symbols( const symbol_section_accessor& section) { @@ -57,47 +89,57 @@ namespace for (auto i = 0u; i != section.get_symbols_num(); ++i) { // TODO: this is boyscout code, caching the temporaries // may be of worth. 
- string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (sect_idx == SHN_UNDEF && !name.empty()) { - r.push_back(std::move(name)); + auto tmp = read_symbol(section, i); + if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) { + r.push_back(std::move(tmp.name)); } } return r; } - pair find_symbol_address( - const symbol_section_accessor& section, - const string& symbol_name) + const std::unordered_map< + std::string, + std::pair>& symbol_addresses() { - static constexpr pair r{0, 0}; + static unordered_map> r; + static once_flag f; - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; + call_once(f, []() { + dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { + static constexpr const char self[] = "/proc/self/exe"; + elfio reader; - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); + static unsigned int iter = 0u; + if (reader.load(!iter ? self : info->dlpi_name)) { + auto it = find_section_if( + reader, [](const class section* x) { + return x->get_type() == SHT_SYMTAB; + }); - if (name == symbol_name) return make_pair(value, size); - } + if (it) { + const symbol_section_accessor symtab{reader, it}; + + for (auto i = 0u; i != symtab.get_symbols_num(); ++i) { + auto tmp = read_symbol(symtab, i); + + if (tmp.type == STT_OBJECT && + tmp.sect_idx != SHN_UNDEF) { + const auto addr = + tmp.value + (iter ? 
info->dlpi_addr : 0); + r.emplace( + move(tmp.name), make_pair(addr, tmp.size)); + } + } + } + + ++iter; + } + + return 0; + }, nullptr); + }); return r; } @@ -116,55 +158,43 @@ namespace symbol_section_accessor{reader, code_object_dynsym}); for (auto&& x : undefined_symbols) { - const auto tmp = find_symbol_address( - symbol_section_accessor{self_reader, process_symtab}, x); + using RAII_global = + unique_ptr; - if (!tmp.first) { - throw runtime_error{ - "The global variable: " + x + ", could not be found."}; + static unordered_map globals; + static once_flag f; + call_once(f, [=]() { globals.reserve(symbol_addresses().size()); }); + + if (globals.find(x) != globals.cend()) return; + + const auto it1 = symbol_addresses().find(x); + + if (it1 == symbol_addresses().cend()) { + throw runtime_error{"Global symbol: " + x + " is undefined."}; } - static unordered_map< - Elf64_Addr, - unique_ptr> globals; + static mutex mtx; + lock_guard lck{mtx}; - if (globals.count(tmp.first) == 0) { - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), - tmp.second, - &agent, - 1, - &p); + if (globals.find(x) != globals.cend()) return; - static mutex mtx; + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(it1->second.first), + it1->second.second, + nullptr, // All agents. + 0, + &p); - lock_guard lck{mtx}; - globals.emplace( - piecewise_construct, - make_tuple(tmp.first), - make_tuple(p, hsa_amd_memory_unlock)); - } - - const auto it = globals.find(tmp.first); - - assert(it != globals.cend()); + if (!p) { cerr << it1->first << endl; assert(false); } hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), it->second.get()); + executable, agent, x.c_str(), p); + + globals.emplace(x, RAII_global{p, hsa_amd_memory_unlock}); } } - template - inline - section* find_section_if(elfio& reader, P p) - { - const auto it = find_if( - reader.sections.begin(), reader.sections.end(), std::move(p)); - - return it != reader.sections.end() ? 
*it : nullptr; - } - vector code_object_blob_for_process() { static constexpr const char self[] = "/proc/self/exe"; @@ -217,8 +247,8 @@ namespace Bundled_code_header tmp{blob}; if (valid(tmp)) { for (auto&& bundle : bundles(tmp)) { - r[triple_to_hsa_isa(bundle.triple)] - .push_back(bundle.blob); + r[triple_to_hsa_isa(bundle.triple)].push_back( + bundle.blob); } } } @@ -233,24 +263,15 @@ namespace vector> r; symbol_section_accessor symbols{reader, symtab}; - auto foo = reader.get_entry(); - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { // TODO: this is boyscout code, caching the temporaries // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; + auto tmp = read_symbol(symbols, i); - symbols.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { - r.emplace_back(value, name); + if (tmp.type == STT_FUNC && + tmp.sect_idx != SHN_UNDEF && + !tmp.name.empty()) { + r.emplace_back(tmp.value, tmp.name); } } @@ -417,7 +438,7 @@ namespace hip_impl for (auto&& acc : accelerators) { auto agent = static_cast(acc.get_hsa_agent()); - if (!agent) continue; + if (!agent || !acc.is_hsa_accelerator()) continue; hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { const auto it = code_object_blobs().find(x); From 4131b47134bbffd1d386859c72fcaa41878cf026 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 21 Nov 2017 17:52:01 +0000 Subject: [PATCH 11/35] Modify the set component of the memcpy test (unclear why there is a memset component to begin with). 
--- tests/src/deviceLib/hipDeviceMemcpy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/src/deviceLib/hipDeviceMemcpy.cpp b/tests/src/deviceLib/hipDeviceMemcpy.cpp index 46656a434c..e845ae8f2f 100644 --- a/tests/src/deviceLib/hipDeviceMemcpy.cpp +++ b/tests/src/deviceLib/hipDeviceMemcpy.cpp @@ -23,7 +23,7 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In) __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) { int tx = threadIdx.x; - memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN))); + memset(ptr + tx, val, sizeof(uint32_t)); } int main() From 08f252e4bfe4a57b3918cad0981289f24e5f1b9d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 22 Nov 2017 19:37:03 +0000 Subject: [PATCH 12/35] Remove leftover comment. --- include/hip/hcc_detail/grid_launch_GGL.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index e3fa3331ac..7a9500f4d6 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -125,7 +125,7 @@ void hipLaunchKernelGGL( std::size_t kernarg_size = kernarg.size(); void* config[] = { - HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),//&kernarg, + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, HIP_LAUNCH_PARAM_END }; From 9d47a4d9800d4e472be647373361503888e897e8 Mon Sep 17 00:00:00 2001 From: Chris Kitching Date: Mon, 13 Nov 2017 17:20:07 +0000 Subject: [PATCH 13/35] Add hipify mappings for all CUDA headers that have HIP equivalents I'm particularly running into issues with `device_types.h` in real CUDA code... 
--- hipify-clang/src/CUDA2HipMap.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hipify-clang/src/CUDA2HipMap.cpp b/hipify-clang/src/CUDA2HipMap.cpp index de6ddb2d74..b2e5251139 100644 --- a/hipify-clang/src/CUDA2HipMap.cpp +++ b/hipify-clang/src/CUDA2HipMap.cpp @@ -312,9 +312,16 @@ const std::map CUDA_TYPE_NAME_MAP{ /// Maps cuda header names to hip header names. const std::map CUDA_INCLUDE_MAP{ // CUDA includes - {"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}}, - {"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}}, - {"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}}, + {"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}}, + {"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}}, + {"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE, API_RUNTIME}}, + {"device_functions.h", {"hip/device_functions.h", CONV_INCLUDE, API_RUNTIME}}, + {"driver_types.h", {"hip/driver_types.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuComplex.h", {"hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda_fp16.h", {"hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda_texture_types.h", {"hip/hip_texture_types.h", CONV_INCLUDE, API_RUNTIME}}, + {"vector_types.h", {"hip/hip_vector_types.h", CONV_INCLUDE, API_RUNTIME}}, // CUBLAS includes {"cublas.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}}, From a401ce6e5d5885e89e168c1ce838c37afdad09aa Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 17 Nov 2017 16:00:28 +0000 Subject: [PATCH 14/35] This fixes some outright quaint choices made when implementing HIP's bitwise conversion functions, by using simple reinterpret_casts, as is idiomatic. These functions are supposed to be re-entrant, correct and efficient. 
Sadly, they were neither: they hid a massive race condition against a value stored in global memory, which means that they were also unreasonably slow if they ever managed to be correct, and relied on union based type punning which is in a grey area of the standard. It is difficult to ascertain what may have been the reason for coming up with this quirky solution. --- src/device_functions.cpp | 53 +++++++++------------------------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index 615ae4d0b7..a66cc1e9fb 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -23,27 +23,6 @@ THE SOFTWARE. #include #include "device_util.h" -struct holder64Bit{ - union{ - double d; - unsigned long int uli; - signed long int sli; - signed int si[2]; - unsigned int ui[2]; - }; -} __attribute__((aligned(8))); - -struct holder32Bit { - union { - float f; - unsigned int ui; - signed int si; - }; -} __attribute__((aligned(4))); - -__device__ struct holder64Bit hold64; -__device__ struct holder32Bit hold32; - __device__ float __double2float_rd(double x) { return (double)x; @@ -64,13 +43,11 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - hold64.d = x; - return hold64.si[1]; + return reinterpret_cast(x)[1]; } __device__ int __double2loint(double x) { - hold64.d = x; - return hold64.si[0]; + return reinterpret_cast(x)[0]; } @@ -145,8 +122,7 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - hold64.d = x; - return hold64.sli; + return reinterpret_cast(x); } __device__ int __float2int_rd(float x) @@ -219,19 +195,17 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - hold32.f = x; - return hold32.si; + return reinterpret_cast(x); } __device__ unsigned int __float_as_uint(float x) { - hold32.f = x; - return hold32.ui; + return 
reinterpret_cast(x); } __device__ double __hiloint2double(int hi, int lo) -{ - hold64.si[1] = hi; - hold64.si[0] = lo; - return hold64.d; +{ // TODO: this matches the original in not considering endianness, is that + // correct though? + int tmp[] = {lo, hi}; + return reinterpret_cast(tmp); } __device__ double __int2double_rn(int x) { @@ -257,8 +231,7 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - hold32.si = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ll2double_rd(long long int x) @@ -297,8 +270,7 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - hold64.sli = x; - return hold64.d; + return reinterpret_cast(x); } __device__ double __uint2double_rn(int x) @@ -325,8 +297,7 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - hold32.ui = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ull2double_rd(unsigned long long int x) From a6ccaf3d5780f2054055742b76cf1c616aef49a3 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 18 Nov 2017 01:16:31 +0000 Subject: [PATCH 15/35] This actually (tries) to do the right thing all the way, by using memcpy for bitcasting, and not rely on undefined behaviour of a different flavour as a substitute for the original undefined behaviour. Note that the compiler will (should) optimise down to the same emitted code, since this is a pattern it understands. 
--- src/device_functions.cpp | 71 ++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index a66cc1e9fb..63425bc9f4 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -43,11 +43,21 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - return reinterpret_cast(x)[1]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[1]; } __device__ int __double2loint(double x) { - return reinterpret_cast(x)[0]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[0]; } @@ -122,7 +132,12 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - return reinterpret_cast(x); + static_assert(sizeof(long long) == sizeof(double), ""); + + long long tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ int __float2int_rd(float x) @@ -195,17 +210,32 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - return reinterpret_cast(x); + static_assert(sizeof(int) == sizeof(float), ""); + + int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ unsigned int __float_as_uint(float x) { - return reinterpret_cast(x); + static_assert(sizeof(unsigned int) == sizeof(float), ""); + + unsigned int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } -__device__ double __hiloint2double(int hi, int lo) -{ // TODO: this matches the original in not considering endianness, is that - // correct though? 
- int tmp[] = {lo, hi}; - return reinterpret_cast(tmp); +__device__ double __hiloint2double(int32_t hi, int32_t lo) +{ + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + uint64_t tmp0 = + (static_cast(hi) << 32ull) | static_cast(lo); + double tmp1; + __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + + return tmp1; } __device__ double __int2double_rn(int x) { @@ -231,7 +261,12 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ll2double_rd(long long int x) @@ -270,7 +305,12 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - return reinterpret_cast(x); + static_assert(sizeof(double) == sizeof(long long), ""); + + double tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return x; } __device__ double __uint2double_rn(int x) @@ -297,7 +337,12 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(unsigned int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ull2double_rd(unsigned long long int x) From 265c3b224e251ecbb5b5be5e45b3ee792f2036af Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 23 Nov 2017 09:57:24 +0530 Subject: [PATCH 16/35] Fix float2int rounding functions Change-Id: I67943859a6344c5eec0eaa23418c9b802ef72468 --- src/device_functions.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index 63425bc9f4..6f91e3c939 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -23,6 +23,27 @@ THE SOFTWARE. 
#include #include "device_util.h" +struct holder64Bit{ + union{ + double d; + unsigned long int uli; + signed long int sli; + signed int si[2]; + unsigned int ui[2]; + }; +} __attribute__((aligned(8))); + +struct holder32Bit { + union { + float f; + unsigned int ui; + signed int si; + }; +} __attribute__((aligned(4))); + +__device__ struct holder64Bit hold64; +__device__ struct holder32Bit hold32; + __device__ float __double2float_rd(double x) { return (double)x; From 02c2bfc7eff1958190995f3cd003771328252315 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 28 Nov 2017 19:15:29 +0000 Subject: [PATCH 17/35] Re-sync with upstream and re-factor platform global management for texture references. --- include/hip/hcc_detail/program_state.hpp | 4 ++++ src/hip_module.cpp | 6 +++--- src/program_state.cpp | 24 ++++++++++++------------ 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp index 0e21b12f5f..65896e97a7 100644 --- a/include/hip/hcc_detail/program_state.hpp +++ b/include/hip/hcc_detail/program_state.hpp @@ -23,6 +23,7 @@ THE SOFTWARE. 
#pragma once #include +#include #include #include @@ -68,12 +69,15 @@ namespace hip_impl } }; + using RAII_global = std::unique_ptr; + const std::unordered_map< hsa_agent_t, std::vector>& executables(); const std::unordered_map< std::uintptr_t, std::vector>>& functions(); const std::unordered_map& function_names(); + std::unordered_map& globals(); hsa_executable_t load_executable( hsa_executable_t executable, hsa_agent_t agent, std::istream& file); diff --git a/src/hip_module.cpp b/src/hip_module.cpp index d8fa2db097..00ffd8b03b 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -725,9 +725,9 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const if(name == NULL || hmod == NULL){ ret = hipErrorNotInitialized; } else{ - const auto it = hmod->coGlobals.find(name); - if (it == hmod->coGlobals.end()) return ihipLogStatus(hipErrorInvalidValue); - *texRef = reinterpret_cast(it->second); + const auto it = hip_impl::globals().find(name); + if (it == hip_impl::globals().end()) return ihipLogStatus(hipErrorInvalidValue); + *texRef = reinterpret_cast(it->second.get()); ret = hipSuccess; } } diff --git a/src/program_state.cpp b/src/program_state.cpp index 79d692e06f..61c90556be 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -158,14 +158,7 @@ namespace symbol_section_accessor{reader, code_object_dynsym}); for (auto&& x : undefined_symbols) { - using RAII_global = - unique_ptr; - - static unordered_map globals; - static once_flag f; - call_once(f, [=]() { globals.reserve(symbol_addresses().size()); }); - - if (globals.find(x) != globals.cend()) return; + if (globals().find(x) != globals().cend()) return; const auto it1 = symbol_addresses().find(x); @@ -176,7 +169,7 @@ namespace static mutex mtx; lock_guard lck{mtx}; - if (globals.find(x) != globals.cend()) return; + if (globals().find(x) != globals().cend()) return; void* p = nullptr; hsa_amd_memory_lock( @@ -186,12 +179,10 @@ namespace 0, &p); - if (!p) { cerr << 
it1->first << endl; assert(false); } - hsa_executable_agent_global_variable_define( executable, agent, x.c_str(), p); - globals.emplace(x, RAII_global{p, hsa_amd_memory_unlock}); + globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock}); } } @@ -534,6 +525,15 @@ namespace hip_impl return r; } + unordered_map& globals() + { + static unordered_map r; + static once_flag f; + call_once(f, []() { r.reserve(symbol_addresses().size()); }); + + return r; + } + hsa_executable_t load_executable( hsa_executable_t executable, hsa_agent_t agent, istream& file) { From 6e4ca3fbb4b8b4057ff9a636ffd2f1989126d097 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 28 Nov 2017 19:45:47 +0000 Subject: [PATCH 18/35] Change memset kernel to use memcpy instead of placement new. Simplify indexers. --- include/hip/hcc_detail/hip_runtime.h | 15 +++------------ src/hip_memory.cpp | 5 ++++- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index c2302d4dc1..924e774af0 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -389,18 +389,9 @@ template(&f[idx]), + reinterpret_cast(&value), + sizeof(T)); idx += grid_dim; } } From 5aeb5dcd6ffe2f02058e89ed4d147268e0b5067f Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 28 Nov 2017 19:56:04 +0000 Subject: [PATCH 19/35] Remove leftover agent allocated globals. 
--- src/device_functions.cpp | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index b888bde5d5..396b5b0594 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -28,27 +28,6 @@ extern "C" float __ocml_rint_f32(float); extern "C" float __ocml_ceil_f32(float); extern "C" float __ocml_trunc_f32(float); -struct holder64Bit{ - union{ - double d; - unsigned long int uli; - signed long int sli; - signed int si[2]; - unsigned int ui[2]; - }; -} __attribute__((aligned(8))); - -struct holder32Bit { - union { - float f; - unsigned int ui; - signed int si; - }; -} __attribute__((aligned(4))); - -__device__ struct holder64Bit hold64; -__device__ struct holder32Bit hold32; - __device__ float __double2float_rd(double x) { return (double)x; From 89e9399427d41ad20b564a7d5bb66b78ffc6220d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 00:17:44 +0000 Subject: [PATCH 20/35] Choose whether or not to use functional grid_launch based on the version of HCC used to compile. 
--- include/hip/hcc_detail/code_object_bundle.hpp | 22 + include/hip/hcc_detail/concepts.hpp | 2 +- .../hip/hcc_detail/functional_grid_launch.hpp | 159 +++ include/hip/hcc_detail/grid_launch_GGL.hpp | 146 +-- .../hcc_detail/macro_based_grid_launch.hpp | 1004 +++++++++++++++++ src/functional_grid_launch.inl | 138 +++ src/grid_launch.cpp | 121 +- src/hip_module.cpp | 1 + src/macro_based_grid_launch.inl | 99 ++ 9 files changed, 1436 insertions(+), 256 deletions(-) create mode 100644 include/hip/hcc_detail/functional_grid_launch.hpp create mode 100644 include/hip/hcc_detail/macro_based_grid_launch.hpp create mode 100644 src/functional_grid_launch.inl create mode 100644 src/macro_based_grid_launch.inl diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp index 080132c561..05ba44fcc8 100644 --- a/include/hip/hcc_detail/code_object_bundle.hpp +++ b/include/hip/hcc_detail/code_object_bundle.hpp @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + #pragma once #include diff --git a/include/hip/hcc_detail/concepts.hpp b/include/hip/hcc_detail/concepts.hpp index 5c50f5d577..18c1119b73 100644 --- a/include/hip/hcc_detail/concepts.hpp +++ b/include/hip/hcc_detail/concepts.hpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp new file mode 100644 index 0000000000..bbffae52e8 --- /dev/null +++ b/include/hip/hcc_detail/functional_grid_launch.hpp @@ -0,0 +1,159 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "code_object_bundle.hpp" +#include "concepts.hpp" +#include "helpers.hpp" +#include "program_state.hpp" + +#include "hc.hpp" +#include "hip/hip_hcc.h" +#include "hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl +{ + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + T round_up_to_next_multiple_nonnegative(T x, T y) + { + T tmp = x + y - 1; + return tmp - tmp % y; + } + + inline + std::vector make_kernarg() + { + return {}; + } + + inline + std::vector make_kernarg(std::vector kernarg) + { + return kernarg; + } + + template + inline + std::vector make_kernarg(std::vector kernarg, T x) + { + kernarg.resize( + round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + + sizeof(T)); + + new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; + + return kernarg; + } + + template + inline + std::vector make_kernarg( + std::vector kernarg, T x, Ts... xs) + { + return make_kernarg( + make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); + } + + template + inline + std::vector make_kernarg(Ts... xs) + { + std::vector kernarg; + kernarg.reserve(sizeof(std::tuple)); + + return make_kernarg(std::move(kernarg), std::move(xs)...); + } + + void hipLaunchKernelGGLImpl( + std::uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + void** kernarg); +} // Namespace hip_impl. + +template +inline +void hipLaunchKernelGGL( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + Args... 
args) +{ + auto kernarg = hip_impl::make_kernarg(std::move(args)...); + std::size_t kernarg_size = kernarg.size(); + + void* config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), + HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, + HIP_LAUNCH_PARAM_END + }; + + hip_impl::hipLaunchKernelGGLImpl( + reinterpret_cast(kernel), + numBlocks, + dimBlocks, + sharedMemBytes, + stream, + &config[0]); +} + +template +inline +void hipLaunchKernel( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t groupMemBytes, + hipStream_t stream, + Args... args) +{ + hipLaunchKernelGGL( + kernel, + numBlocks, + dimBlocks, + groupMemBytes, + stream, + hipLaunchParm{}, + std::move(args)...); +} + diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 7a9500f4d6..187d84dbff 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -20,143 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#pragma once - #if GENERIC_GRID_LAUNCH == 1 - -#include "code_object_bundle.hpp" -#include "concepts.hpp" -#include "helpers.hpp" -#include "program_state.hpp" - -#include "hc.hpp" -#include "hip/hip_hcc.h" -#include "hip_runtime.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace hip_impl -{ - template< - typename T, - typename std::enable_if{}>::type* = nullptr> - inline - T round_up_to_next_multiple_nonnegative(T x, T y) - { - T tmp = x + y - 1; - return tmp - tmp % y; - } - - inline - std::vector make_kernarg() - { - return {}; - } - - inline - std::vector make_kernarg(std::vector kernarg) - { - return kernarg; - } - - template - inline - std::vector make_kernarg(std::vector kernarg, T x) - { - kernarg.resize( - round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + - sizeof(T)); - - new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; - - return kernarg; - } - - template - inline - std::vector make_kernarg( - std::vector kernarg, T x, Ts... xs) - { - return make_kernarg( - make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); - } - - template - inline - std::vector make_kernarg(Ts... xs) - { - std::vector kernarg; - kernarg.reserve(sizeof(std::tuple)); - - return make_kernarg(std::move(kernarg), std::move(xs)...); - } - - void hipLaunchKernelGGLImpl( - std::uintptr_t function_address, - const dim3& numBlocks, - const dim3& dimBlocks, - std::uint32_t sharedMemBytes, - hipStream_t stream, - void** kernarg); -} // Namespace hip_impl. - -template -inline -void hipLaunchKernelGGL( - F kernel, - const dim3& numBlocks, - const dim3& dimBlocks, - std::uint32_t sharedMemBytes, - hipStream_t stream, - Args... 
args) -{ - auto kernarg = hip_impl::make_kernarg(std::move(args)...); - std::size_t kernarg_size = kernarg.size(); - - void* config[] = { - HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), - HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, - HIP_LAUNCH_PARAM_END - }; - - hip_impl::hipLaunchKernelGGLImpl( - reinterpret_cast(kernel), - numBlocks, - dimBlocks, - sharedMemBytes, - stream, - &config[0]); -} - -template -inline -void hipLaunchKernel( - F kernel, - const dim3& numBlocks, - const dim3& dimBlocks, - std::uint32_t groupMemBytes, - hipStream_t stream, - Args... args) -{ - hipLaunchKernelGGL( - kernel, - numBlocks, - dimBlocks, - groupMemBytes, - stream, - hipLaunchParm{}, - std::move(args)...); -} - -#endif //GENERIC_GRID_LAUNCH + #if __hcc_workweek__ >= 17481 + #define FUNCTIONAL_GRID_LAUNCH + #include "functional_grid_launch.hpp" + #else + #include "macro_based_grid_launch.hpp" + #endif +#endif //GENERIC_GRID_LAUNCH \ No newline at end of file diff --git a/include/hip/hcc_detail/macro_based_grid_launch.hpp b/include/hip/hcc_detail/macro_based_grid_launch.hpp new file mode 100644 index 0000000000..f1dfe76245 --- /dev/null +++ b/include/hip/hcc_detail/macro_based_grid_launch.hpp @@ -0,0 +1,1004 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "concepts.hpp" +#include "helpers.hpp" + +#include "hc.hpp" +#include "hip/hip_hcc.h" +#include "hip_runtime.h" + +#include +#include +#include +#include +#include + +namespace hip_impl +{ + namespace + { + struct New_grid_launch_tag {}; + struct Old_grid_launch_tag {}; + + template + class RAII_guard { + D dtor_; + public: + RAII_guard() = default; + + RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} + { + ctor(); + } + + RAII_guard(const RAII_guard&) = default; + RAII_guard(RAII_guard&&) = default; + + RAII_guard& operator=(const RAII_guard&) = default; + RAII_guard& operator=(RAII_guard&&) = default; + + ~RAII_guard() { dtor_(); } + }; + + template + RAII_guard make_RAII_guard(const C& ctor, D dtor) + { + return RAII_guard{ctor, std::move(dtor)}; + } + + template + using is_new_grid_launch_t = typename std::conditional< + is_callable{}, + New_grid_launch_tag, + Old_grid_launch_tag>::type; + } + + // TODO: - dispatch rank should be derived from the domain dimensions passed + // in, and not always assumed to be 3; + + template + requires(Domain == {Ts...}) + inline + void grid_launch_hip_impl_( + New_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + const hc::accelerator_view& acc_v, + K k) + { + const auto d = hc::extent<3>{ + num_blocks.z * dim_blocks.z, + num_blocks.y * dim_blocks.y, + num_blocks.x * dim_blocks.x}.tile_with_dynamic( + dim_blocks.z, + dim_blocks.y, + dim_blocks.x, + group_mem_bytes); + + try { + 
hc::parallel_for_each(acc_v, d, k); + } + catch (std::exception& ex) { + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; + throw; + } + } + + // TODO: these are workarounds, they should be removed. + + hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); + void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); + void unlock_stream_hip_( + hipStream_t, void*, const char*, hc::accelerator_view*); + + template + requires(Domain == {Ts...}) + inline + void grid_launch_hip_impl_( + New_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + void* lck_stream = nullptr; + auto acc_v = lock_stream_hip_(stream, lck_stream); + auto stream_guard = make_RAII_guard( + std::bind( + print_prelaunch_trace_, + kernel_name, + num_blocks, + dim_blocks, + group_mem_bytes, + stream), + std::bind( + unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); + + try { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + acc_v, + std::move(k)); + } + catch (std::exception& ex) { + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; + throw; + } + } + + template + requires(Domain == {hipLaunchParm, Ts...}) + inline + void grid_launch_hip_impl_( + Old_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + K k) + { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + std::move(k)); + } + + template + requires(Domain == {hipLaunchParm, Ts...}) + inline + void grid_launch_hip_impl_( + Old_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + 
std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + kernel_name, + std::move(k)); + } + + template + requires(Domain == {Ts...}) + inline + std::enable_if_t::value> grid_launch_hip_( + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + grid_launch_hip_impl_( + is_new_grid_launch_t{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + kernel_name, + std::move(k)); + } + + template + requires(Domain == {Ts...}) + inline + std::enable_if_t::value> grid_launch_hip_( + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + K k) + { + grid_launch_hip_impl_( + is_new_grid_launch_t{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + std::move(k)); + } + + // TODO: these are temporary and purposefully noisy and disruptive. + #define make_kernel_name_hip(k, n)\ + HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ + HIP_kernel_functor_name_end ## _ ## n + + #define make_kernel_functor_hip_30(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26, p27)\ + struct make_kernel_name_hip(function_name, 28) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + std::decay_t _p27_;\ + void operator()(const hc::tiled_index<3>&) const 
[[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_, _p27_);\ + }\ + } + #define make_kernel_functor_hip_29(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26)\ + struct make_kernel_name_hip(function_name, 27) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_);\ + }\ + } + #define make_kernel_functor_hip_28(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25)\ + struct make_kernel_name_hip(function_name, 26) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + 
std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ + }\ + } + #define make_kernel_functor_hip_27(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24)\ + struct make_kernel_name_hip(function_name, 25) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ + }\ + } + #define make_kernel_functor_hip_26(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ + struct make_kernel_name_hip(function_name, 24) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t 
_p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ + }\ + } + #define make_kernel_functor_hip_25(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ + struct make_kernel_name_hip(function_name, 23) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + __attribute__((used, flatten))\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_);\ + }\ + } + #define make_kernel_functor_hip_24(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ + struct make_kernel_name_hip(function_name, 22) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + 
std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_);\ + }\ + } + #define make_kernel_functor_hip_23(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ + struct make_kernel_name_hip(function_name, 21) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_);\ + }\ + } + #define make_kernel_functor_hip_22(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ + struct make_kernel_name_hip(function_name, 20) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + 
std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_);\ + }\ + } + #define make_kernel_functor_hip_21(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18)\ + struct make_kernel_name_hip(function_name, 19) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_);\ + }\ + } + #define make_kernel_functor_hip_20(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17)\ + struct make_kernel_name_hip(function_name, 18) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, 
_p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ + }\ + } + #define make_kernel_functor_hip_19(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16)\ + struct make_kernel_name_hip(function_name, 17) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ + }\ + } + #define make_kernel_functor_hip_18(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15)\ + struct make_kernel_name_hip(function_name, 16) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ + }\ + } + #define make_kernel_functor_hip_17(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14)\ + struct make_kernel_name_hip(function_name, 15) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ 
+ std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_);\ + }\ + } + #define make_kernel_functor_hip_16(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13)\ + struct make_kernel_name_hip(function_name, 14) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_);\ + }\ + } + #define make_kernel_functor_hip_15(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12)\ + struct make_kernel_name_hip(function_name, 13) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_);\ + }\ + } + #define make_kernel_functor_hip_14(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11)\ + struct make_kernel_name_hip(function_name, 12) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t 
_p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_);\ + }\ + } + #define make_kernel_functor_hip_13(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ + struct make_kernel_name_hip(function_name, 11) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_);\ + }\ + } + #define make_kernel_functor_hip_12(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ + struct make_kernel_name_hip(function_name, 10) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ + _p9_);\ + }\ + } + #define make_kernel_functor_hip_11(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ + struct make_kernel_name_hip(function_name, 9) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ + }\ + } + #define make_kernel_functor_hip_10(\ + function_name, kernel_name, p0, 
p1, p2, p3, p4, p5, p6, p7)\ + struct make_kernel_name_hip(function_name, 8) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ + }\ + } + #define make_kernel_functor_hip_9(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ + struct make_kernel_name_hip(function_name, 7) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ + }\ + } + #define make_kernel_functor_hip_8(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ + struct make_kernel_name_hip(function_name, 6) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ + }\ + } + #define make_kernel_functor_hip_7(\ + function_name, kernel_name, p0, p1, p2, p3, p4)\ + struct make_kernel_name_hip(function_name, 5) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ + }\ + } + #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ + struct make_kernel_name_hip(function_name, 4) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_);\ + }\ + } + #define make_kernel_functor_hip_5(function_name, kernel_name, p0, 
p1, p2)\ + struct make_kernel_name_hip(function_name, 3) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_);\ + }\ + } + #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ + struct make_kernel_name_hip(function_name, 2) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_);\ + }\ + } + #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n + #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ + struct make_kernel_name_hip(function_name, 1) {\ + std::decay_t _p0_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_);\ + }\ + } + #define make_kernel_functor_hip_2(function_name, kernel_name)\ + struct make_kernel_name_hip(function_name, 0) {\ + void operator()(const hc::tiled_index<3>&) [[hc]]\ + {\ + return kernel_name(hipLaunchParm{});\ + }\ + } + #define make_kernel_functor_hip_1(...) + #define make_kernel_functor_hip_0(...) 
+ #define make_kernel_functor_hip_(...)\ + overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) + + + #define hipLaunchNamedKernelGGL(\ + function_name,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ...)\ + do {\ + make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ + hip_kernel_functor_impl_{__VA_ARGS__};\ + hip_impl::grid_launch_hip_(\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + #kernel_name,\ + hip_kernel_functor_impl_);\ + } while(0) + + #define hipLaunchKernelGGL(\ + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchNamedKernelGGL(\ + unnamed,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ##__VA_ARGS__);\ + } while (0) + + #define hipLaunchKernel(\ + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) +} \ No newline at end of file diff --git a/src/functional_grid_launch.inl b/src/functional_grid_launch.inl new file mode 100644 index 0000000000..4a26f66c8c --- /dev/null +++ b/src/functional_grid_launch.inl @@ -0,0 +1,138 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hcc_detail/grid_launch_GGL.hpp" +#include "hip/hcc_detail/program_state.hpp" + +#include "hip/hip_runtime_api.h" + +// Internal header, do not percolate upwards. +#include "hip_hcc_internal.h" +#include "hc.hpp" +#include "trace_helper.h" + +#include +#include +#include +#include + +#include + +using namespace hc; +using namespace std; + +namespace hip_impl +{ + namespace + { + inline + string name(uintptr_t function_address) + { + const auto it = function_names().find(function_address); + + if (it == function_names().cend()) { + throw runtime_error{ + "Invalid function passed to hipLaunchKernelGGL."}; + } + + return it->second; + } + + inline + string name(hsa_agent_t agent) + { + char n[64] = {}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return string{n}; + } + + inline + hsa_agent_t target_agent(hipStream_t stream) + { + if (stream) { + return *static_cast( + stream->locked_getAv()->get_hsa_agent()); + } + else if ( + ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + return ihipGetDevice( + ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; + } + else { + return *static_cast( + accelerator{}.get_default_view().get_hsa_agent()); + } + } + } + + void hipLaunchKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, + hipStream_t stream, + void** kernarg) + { + const auto it0 = functions().find(function_address); + + if (it0 == functions().cend()) { + throw runtime_error{ + "No 
device code available for function: " + + name(function_address) + }; + } + + auto agent = target_agent(stream); + + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first.handle == agent.handle; + }); + + if (it1 == it0->second.cend()) { + throw runtime_error{ + "No code available for function: " + name(function_address) + + ", for agent: " + name(agent) + }; + } + + for (auto&& agent_kernel : it0->second) { + if (agent.handle == agent_kernel.first.handle) { + hipModuleLaunchKernel( + agent_kernel.second, + numBlocks.x, + numBlocks.y, + numBlocks.z, + dimBlocks.x, + dimBlocks.y, + dimBlocks.z, + sharedMemBytes, + stream, + nullptr, + kernarg); + } + } + } +} diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index 4a26f66c8c..484d314fba 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -20,119 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip/hcc_detail/grid_launch_GGL.hpp" -#include "hip/hcc_detail/program_state.hpp" - -#include "hip/hip_runtime_api.h" - -// Internal header, do not percolate upwards. 
-#include "hip_hcc_internal.h" -#include "hc.hpp" -#include "trace_helper.h" - -#include -#include -#include -#include - -#include - -using namespace hc; -using namespace std; - -namespace hip_impl -{ - namespace - { - inline - string name(uintptr_t function_address) - { - const auto it = function_names().find(function_address); - - if (it == function_names().cend()) { - throw runtime_error{ - "Invalid function passed to hipLaunchKernelGGL."}; - } - - return it->second; - } - - inline - string name(hsa_agent_t agent) - { - char n[64] = {}; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); - - return string{n}; - } - - inline - hsa_agent_t target_agent(hipStream_t stream) - { - if (stream) { - return *static_cast( - stream->locked_getAv()->get_hsa_agent()); - } - else if ( - ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { - return ihipGetDevice( - ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; - } - else { - return *static_cast( - accelerator{}.get_default_view().get_hsa_agent()); - } - } - } - - void hipLaunchKernelGGLImpl( - uintptr_t function_address, - const dim3& numBlocks, - const dim3& dimBlocks, - uint32_t sharedMemBytes, - hipStream_t stream, - void** kernarg) - { - const auto it0 = functions().find(function_address); - - if (it0 == functions().cend()) { - throw runtime_error{ - "No device code available for function: " + - name(function_address) - }; - } - - auto agent = target_agent(stream); - - const auto it1 = find_if( - it0->second.cbegin(), - it0->second.cend(), - [=](const pair& x) { - return x.first.handle == agent.handle; - }); - - if (it1 == it0->second.cend()) { - throw runtime_error{ - "No code available for function: " + name(function_address) + - ", for agent: " + name(agent) - }; - } - - for (auto&& agent_kernel : it0->second) { - if (agent.handle == agent_kernel.first.handle) { - hipModuleLaunchKernel( - agent_kernel.second, - numBlocks.x, - numBlocks.y, - numBlocks.z, - dimBlocks.x, - dimBlocks.y, - 
dimBlocks.z, - sharedMemBytes, - stream, - nullptr, - kernarg); - } - } - } -} +#if defined(FUNCTIONAL_GRID_LAUNCH) + #include "functional_grid_launch.inl" +#else + #include "macro_based_grid_launch.inl" +#endif \ No newline at end of file diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 00ffd8b03b..1477247ae2 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -37,6 +37,7 @@ THE SOFTWARE. #include "elfio/elfio.hpp" #include "hip/hip_runtime.h" +#include "hip/hcc_detail/program_state.hpp" #include "hip_hcc_internal.h" #include "trace_helper.h" diff --git a/src/macro_based_grid_launch.inl b/src/macro_based_grid_launch.inl new file mode 100644 index 0000000000..ad5340c097 --- /dev/null +++ b/src/macro_based_grid_launch.inl @@ -0,0 +1,99 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hcc_detail/grid_launch_GGL.hpp" + +// Internal header, do not percolate upwards. 
+#include "hip_hcc_internal.h" +#include "hc.hpp" +#include "trace_helper.h" + +#include +#include + +namespace hip_impl +{ + hc::accelerator_view lock_stream_hip_( + hipStream_t& stream, void*& locked_stream) + { // This allocated but does not take ownership of locked_stream. If it is + // not deleted elsewhere it will leak. + using L = decltype(stream->lockopen_preKernelCommand()); + + HIP_INIT(); + + stream = ihipSyncAndResolveStream(stream); + locked_stream = new L{stream->lockopen_preKernelCommand()}; + return (*static_cast(locked_stream))->_av; + } + + void print_prelaunch_trace_( + const char* kernel_name, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream) + { + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || + HIP_PROFILE_API || + (COMPILE_HIP_DB && (HIP_TRACE_API & (1<lockopen_preKernelCommand()); + + stream->lockclose_postKernelCommand(kernel_name, acc_v); + + delete static_cast(locked_stream); + locked_stream = nullptr; + if(HIP_PROFILE_API) { + MARKER_END(); + } + } +} \ No newline at end of file From faa546d19430e993d3ad1c3fac1545f0ef379d68 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 01:37:52 +0000 Subject: [PATCH 21/35] Fix oversight in selection mechanism which led to erroneous code to be compiled for the grid_launch_GGL component. --- include/hip/hcc_detail/grid_launch_GGL.hpp | 2 +- src/functional_grid_launch.inl | 1 - src/grid_launch.cpp | 4 +++- src/macro_based_grid_launch.inl | 2 -- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 187d84dbff..10dae540a4 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
#if GENERIC_GRID_LAUNCH == 1 #if __hcc_workweek__ >= 17481 - #define FUNCTIONAL_GRID_LAUNCH + #define FUNCTIONAL_GRID_LAUNCH 1 #include "functional_grid_launch.hpp" #else #include "macro_based_grid_launch.hpp" diff --git a/src/functional_grid_launch.inl b/src/functional_grid_launch.inl index 4a26f66c8c..b555967ebc 100644 --- a/src/functional_grid_launch.inl +++ b/src/functional_grid_launch.inl @@ -20,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip/hcc_detail/grid_launch_GGL.hpp" #include "hip/hcc_detail/program_state.hpp" #include "hip/hip_runtime_api.h" diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index 484d314fba..d63fd2d49a 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -20,7 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#if defined(FUNCTIONAL_GRID_LAUNCH) +#include "hip/hcc_detail/grid_launch_GGL.hpp" + +#if !defined(FUNCTIONAL_GRID_LAUNCH) #include "functional_grid_launch.inl" #else #include "macro_based_grid_launch.inl" diff --git a/src/macro_based_grid_launch.inl b/src/macro_based_grid_launch.inl index ad5340c097..5547d3a71a 100644 --- a/src/macro_based_grid_launch.inl +++ b/src/macro_based_grid_launch.inl @@ -20,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip/hcc_detail/grid_launch_GGL.hpp" - // Internal header, do not percolate upwards. #include "hip_hcc_internal.h" #include "hc.hpp" From 3ed8897a5aefc74507006d63a857093895de1090 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 02:16:44 +0000 Subject: [PATCH 22/35] Add missing file. --- src/grid_launch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index d63fd2d49a..8eb3f1dc75 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
#include "hip/hcc_detail/grid_launch_GGL.hpp" -#if !defined(FUNCTIONAL_GRID_LAUNCH) +#if defined(FUNCTIONAL_GRID_LAUNCH) #include "functional_grid_launch.inl" #else #include "macro_based_grid_launch.inl" From b881cf713ca674a784cc151b6184ccc72441ca61 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 03:05:53 +0000 Subject: [PATCH 23/35] Fix compiler version check. --- include/hip/hcc_detail/grid_launch_GGL.hpp | 2 +- src/grid_launch.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 10dae540a4..95903436b6 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -19,10 +19,10 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#pragma once #if GENERIC_GRID_LAUNCH == 1 #if __hcc_workweek__ >= 17481 - #define FUNCTIONAL_GRID_LAUNCH 1 #include "functional_grid_launch.hpp" #else #include "macro_based_grid_launch.hpp" diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index 8eb3f1dc75..1fe47c189a 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. #include "hip/hcc_detail/grid_launch_GGL.hpp" -#if defined(FUNCTIONAL_GRID_LAUNCH) +#if __hcc_workweek__ >= 17481 #include "functional_grid_launch.inl" #else #include "macro_based_grid_launch.inl" From d2fd1f5544b5474f5b5128461313707300621511 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:01:28 +0000 Subject: [PATCH 24/35] Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions. 
--- include/hip/hcc_detail/hip_runtime.h | 40 ++++++++++--------- samples/0_Intro/square/square.cu | 4 +- src/device_util.cpp | 4 +- src/hip_memory.cpp | 4 +- .../device/hipFuncDeviceSynchronize.cpp | 2 +- tests/src/deviceLib/hipComplex.cpp | 2 +- tests/src/deviceLib/hipDeviceMemcpy.cpp | 4 +- tests/src/deviceLib/hipFloatMath.cpp | 2 +- tests/src/deviceLib/hipSimpleAtomicsTest.cpp | 2 +- tests/src/deviceLib/hipTestDevice.cpp | 32 +++++++-------- tests/src/deviceLib/hipTestDeviceDouble.cpp | 28 ++++++------- tests/src/deviceLib/hipTestDeviceSymbol.cpp | 2 +- tests/src/deviceLib/hipTestHalf.cpp | 4 +- tests/src/deviceLib/hipThreadFence.cpp | 2 +- tests/src/deviceLib/hip_anyall.cpp | 6 +-- tests/src/deviceLib/hip_ballot.cpp | 8 ++-- tests/src/deviceLib/hip_brev.cpp | 4 +- tests/src/deviceLib/hip_clz.cpp | 4 +- tests/src/deviceLib/hip_ffs.cpp | 4 +- tests/src/deviceLib/hip_popc.cpp | 4 +- tests/src/deviceLib/hip_test_ldg.cpp | 4 +- tests/src/deviceLib/hip_test_make_type.cpp | 20 +++++----- tests/src/deviceLib/hip_trig.cpp | 2 +- tests/src/experimental/xcompile/hHip.c | 2 +- tests/src/experimental/xcompile/hipxxKer.cpp | 2 +- tests/src/experimental/xcompile/hxxHip.cpp | 2 +- tests/src/hipC.c | 2 +- tests/src/hipC.cpp | 2 +- tests/src/hipCKernel.c | 2 +- tests/src/kernel/hipDynamicShared.cpp | 4 +- tests/src/kernel/hipDynamicShared2.cpp | 2 +- tests/src/kernel/hipGridLaunch.cpp | 4 +- tests/src/kernel/hipLanguageExtensions.cpp | 8 ++-- tests/src/kernel/hipTestConstant.cpp | 2 +- tests/src/kernel/hipTestMallocKernel.cpp | 4 +- tests/src/kernel/hipTestMemKernel.cpp | 20 +++++----- tests/src/kernel/inline_asm_vadd.cpp | 2 +- tests/src/kernel/inline_asm_vmac.cpp | 2 +- tests/src/kernel/launch_bounds.cpp | 2 +- .../device/hipDeviceSynchronize.cpp | 2 +- .../src/runtimeApi/memory/hipHostGetFlags.cpp | 2 +- tests/src/runtimeApi/memory/hipHostMalloc.cpp | 4 +- .../src/runtimeApi/memory/hipHostRegister.cpp | 2 +- .../src/runtimeApi/memory/hipMemcpyAsync.cpp | 4 +- 
.../memory/hipMemoryAllocateCoherent.cpp | 2 +- .../runtimeApi/memory/p2p_copy_coherency.cpp | 8 ++-- tests/src/runtimeApi/module/hipModule.cpp | 2 +- tests/src/runtimeApi/module/vcpy_kernel.cpp | 2 +- .../multiThread/hipMultiThreadStreams2.cpp | 2 +- .../runtimeApi/stream/hipAPIStreamDisable.cpp | 4 +- .../runtimeApi/stream/hipAPIStreamEnable.cpp | 4 +- tests/src/runtimeApi/stream/hipNullStream.cpp | 4 +- tests/src/runtimeApi/stream/hipStream.h | 2 +- .../synchronization/copy_coherency.cpp | 8 ++-- .../synchronization/memcpyInt.device.cpp | 4 +- tests/src/specialFunc.cu | 2 +- tests/src/stress/hipStressAsync.cpp | 2 +- tests/src/stress/hipStressChain.cpp | 2 +- tests/src/stress/hipStressKernel.cpp | 2 +- tests/src/stress/hipStressSync.cpp | 2 +- tests/src/test_common.h | 20 +++++----- tests/src/texture/hipTextureObj2D.cpp | 4 +- tests/src/texture/hipTextureRef2D.cpp | 4 +- 63 files changed, 173 insertions(+), 171 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 924e774af0..944f74864b 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -381,27 +381,29 @@ __device__ void __threadfence_system(void) ; * @} */ -template::type f> -class Coordinates { - using R = decltype(f(0)); +#if __hcc_workweek__ >= 17481 + template::type f> + class Coordinates { + using R = decltype(f(0)); - struct X { __device__ operator R() const { return f(0); } }; - struct Y { __device__ operator R() const { return f(1); } }; - struct Z { __device__ operator R() const { return f(2); } }; -public: - static constexpr X x{}; - static constexpr Y y{}; - static constexpr Z z{}; -}; + struct X { __device__ operator R() const { return f(0); } }; + struct Y { __device__ operator R() const { return f(1); } }; + struct Z { __device__ operator R() const { return f(2); } }; + public: + static constexpr X x{}; + static constexpr Y y{}; + static constexpr Z z{}; + }; -static constexpr Coordinates blockDim; 
-static constexpr Coordinates blockIdx; -static constexpr Coordinates gridDim; -static constexpr Coordinates threadIdx; + static constexpr Coordinates blockDim; + static constexpr Coordinates blockIdx; + static constexpr Coordinates gridDim; + static constexpr Coordinates threadIdx; +#endif #define hipThreadIdx_x (hc_get_workitem_id(0)) #define hipThreadIdx_y (hc_get_workitem_id(1)) diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu index 82b31db14a..ccaa9ae0bc 100644 --- a/samples/0_Intro/square/square.cu +++ b/samples/0_Intro/square/square.cu @@ -40,8 +40,8 @@ template __global__ void vector_square(T *C_d, const T *A_d, size_t N) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i(&f[idx]), diff --git a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index c8c2e644c3..dac56bf709 100644 --- a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; // Kernel loop designed to execute very slowly... ... ... 
so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i>pshift] = __any(tid -77); - device_all[threadIdx.x>>pshift] = __all(tid -77); + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + device_any[hipThreadIdx_x>>pshift] = __any(tid -77); + device_all[hipThreadIdx_x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp index 14b8f314a1..742c47a065 100644 --- a/tests/src/deviceLib/hip_ballot.cpp +++ b/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const unsigned int warp_num = threadIdx.x >> pshift; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const unsigned int warp_num = hipThreadIdx_x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp index c08c39dec9..855a8bec47 100644 --- a/tests/src/deviceLib/hip_brev.cpp +++ b/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git 
a/tests/src/deviceLib/hip_clz.cpp b/tests/src/deviceLib/hip_clz.cpp index 53fd611184..bdb31f3e8d 100644 --- a/tests/src/deviceLib/hip_clz.cpp +++ b/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp index 49530bb298..c855ede060 100644 --- a/tests/src/deviceLib/hip_ffs.cpp +++ b/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_popc.cpp b/tests/src/deviceLib/hip_popc.cpp index 19dafb4d43..e503e55b42 100644 --- a/tests/src/deviceLib/hip_popc.cpp +++ b/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index 4db522cc10..5540c4917d 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* 
a, const T* bm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_test_make_type.cpp b/tests/src/deviceLib/hip_test_make_type.cpp index 6eba236e12..ce689ceb89 100644 --- a/tests/src/deviceLib/hip_test_make_type.cpp +++ b/tests/src/deviceLib/hip_test_make_type.cpp @@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp, char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp, char2* a, const char2* bm, const char2* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp, char3* a, const char3* bm, const char3* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp, char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = 
hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp, __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - int x = blockDimX * blockIdx.x + threadIdx.x; - int y = blockDimY * blockIdy.y + threadIdx.y; + int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x; + int y = blockDimY * blockIdy.y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp index 6ee8dc58ad..5ec28101f3 100644 --- a/tests/src/deviceLib/hip_trig.cpp +++ b/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. #define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/tests/src/experimental/xcompile/hHip.c b/tests/src/experimental/xcompile/hHip.c index 17e7e9ecf6..2ac4ebc73e 100644 --- a/tests/src/experimental/xcompile/hHip.c +++ b/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp index 5dca6c1bca..d1bbed63cd 100644 --- a/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. 
__global__ void Kern(hipLaunchParm lp, float *A) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; A[tx] += 1.0f; } diff --git a/tests/src/experimental/xcompile/hxxHip.cpp b/tests/src/experimental/xcompile/hxxHip.cpp index bca5d64afc..6a748d5c89 100644 --- a/tests/src/experimental/xcompile/hxxHip.cpp +++ b/tests/src/experimental/xcompile/hxxHip.cpp @@ -33,7 +33,7 @@ class memManager; template __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/hipC.c b/tests/src/hipC.c index efa03bb909..644df6c98f 100644 --- a/tests/src/hipC.c +++ b/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. #define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); - size_t tid = threadIdx.x; + size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t tid = hipThreadIdx_x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/tests/src/kernel/hipDynamicShared2.cpp b/tests/src/kernel/hipDynamicShared2.cpp index 4567ff6fc2..95e70a9956 100644 --- a/tests/src/kernel/hipDynamicShared2.cpp +++ b/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. 
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = threadIdx.x; + int tx = hipThreadIdx_x; for(int i=0;i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Ad[tx] = Ad[tx] + float(1); } diff --git a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index 5cd46c808a..c4f4b23dc0 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -70,8 +70,8 @@ template __global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Array[tx] = Array[tx] + T(1); } diff --git a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp index 66b93a164f..4e343121ed 100644 --- a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp +++ b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp @@ -29,7 +29,7 @@ THE SOFTWARE. 
const int NN = 1 << 21; __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){ - int tid = threadIdx.x; + int tid = hipThreadIdx_x; if(tid < 1){ for(int i=0;i __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; In[tx] = In[tx] + 1; } diff --git a/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/tests/src/runtimeApi/synchronization/copy_coherency.cpp index b2a66f61e2..e4bfb98206 100644 --- a/tests/src/runtimeApi/synchronization/copy_coherency.cpp +++ b/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); __global__ void memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } @@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements) __global__ void memcpyIntKernel(int *dst, const int * src, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp index 2916d51bf9..b34d331682 100644 --- a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp +++ b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp @@ -5,8 +5,8 @@ extern "C" __global__ void memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = 
blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/tests/src/specialFunc.cu b/tests/src/specialFunc.cu index 085be062d9..744dcd8926 100644 --- a/tests/src/specialFunc.cu +++ b/tests/src/specialFunc.cu @@ -23,7 +23,7 @@ THE SOFTWARE. void __global__ test_kernel(float *A) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; float a = __ballot(tid < 16); float b = __shfl(tid < 16); diff --git a/tests/src/stress/hipStressAsync.cpp b/tests/src/stress/hipStressAsync.cpp index a142b41730..1f8cab1a36 100644 --- a/tests/src/stress/hipStressAsync.cpp +++ b/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i = 0; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -169,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; diff --git a/tests/src/texture/hipTextureObj2D.cpp b/tests/src/texture/hipTextureObj2D.cpp index 9ddafd6b1c..443d708418 100644 --- a/tests/src/texture/hipTextureObj2D.cpp +++ b/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = 
hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/tests/src/texture/hipTextureRef2D.cpp b/tests/src/texture/hipTextureRef2D.cpp index c42f09d5a0..ebc7a04385 100644 --- a/tests/src/texture/hipTextureRef2D.cpp +++ b/tests/src/texture/hipTextureRef2D.cpp @@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else From fbaf729f88353841ca2eb4f9b24f593bbb3b14ce Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:36:29 +0000 Subject: [PATCH 25/35] Revert "Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions." 
This reverts commit d2fd1f5 --- samples/0_Intro/square/square.cu | 4 +-- src/device_util.cpp | 4 +-- src/hip_memory.cpp | 4 +-- .../device/hipFuncDeviceSynchronize.cpp | 2 +- tests/src/deviceLib/hipComplex.cpp | 2 +- tests/src/deviceLib/hipDeviceMemcpy.cpp | 4 +-- tests/src/deviceLib/hipFloatMath.cpp | 2 +- tests/src/deviceLib/hipSimpleAtomicsTest.cpp | 2 +- tests/src/deviceLib/hipTestDevice.cpp | 32 +++++++++---------- tests/src/deviceLib/hipTestDeviceDouble.cpp | 28 ++++++++-------- tests/src/deviceLib/hipTestDeviceSymbol.cpp | 2 +- tests/src/deviceLib/hipTestHalf.cpp | 4 +-- tests/src/deviceLib/hipThreadFence.cpp | 2 +- tests/src/deviceLib/hip_anyall.cpp | 6 ++-- tests/src/deviceLib/hip_ballot.cpp | 8 ++--- tests/src/deviceLib/hip_brev.cpp | 4 +-- tests/src/deviceLib/hip_clz.cpp | 4 +-- tests/src/deviceLib/hip_ffs.cpp | 4 +-- tests/src/deviceLib/hip_popc.cpp | 4 +-- tests/src/deviceLib/hip_test_ldg.cpp | 4 +-- tests/src/deviceLib/hip_test_make_type.cpp | 20 ++++++------ tests/src/deviceLib/hip_trig.cpp | 2 +- tests/src/experimental/xcompile/hHip.c | 2 +- tests/src/experimental/xcompile/hipxxKer.cpp | 2 +- tests/src/experimental/xcompile/hxxHip.cpp | 2 +- tests/src/hipC.c | 2 +- tests/src/hipC.cpp | 2 +- tests/src/hipCKernel.c | 2 +- tests/src/kernel/hipDynamicShared.cpp | 4 +-- tests/src/kernel/hipDynamicShared2.cpp | 2 +- tests/src/kernel/hipGridLaunch.cpp | 4 +-- tests/src/kernel/hipLanguageExtensions.cpp | 8 ++--- tests/src/kernel/hipTestConstant.cpp | 2 +- tests/src/kernel/hipTestMallocKernel.cpp | 4 +-- tests/src/kernel/hipTestMemKernel.cpp | 20 ++++++------ tests/src/kernel/inline_asm_vadd.cpp | 2 +- tests/src/kernel/inline_asm_vmac.cpp | 2 +- tests/src/kernel/launch_bounds.cpp | 2 +- .../device/hipDeviceSynchronize.cpp | 2 +- .../src/runtimeApi/memory/hipHostGetFlags.cpp | 2 +- tests/src/runtimeApi/memory/hipHostMalloc.cpp | 4 +-- .../src/runtimeApi/memory/hipHostRegister.cpp | 2 +- .../src/runtimeApi/memory/hipMemcpyAsync.cpp | 4 +-- 
.../memory/hipMemoryAllocateCoherent.cpp | 2 +- .../runtimeApi/memory/p2p_copy_coherency.cpp | 8 ++--- tests/src/runtimeApi/module/hipModule.cpp | 2 +- tests/src/runtimeApi/module/vcpy_kernel.cpp | 2 +- .../multiThread/hipMultiThreadStreams2.cpp | 2 +- .../runtimeApi/stream/hipAPIStreamDisable.cpp | 4 +-- .../runtimeApi/stream/hipAPIStreamEnable.cpp | 4 +-- tests/src/runtimeApi/stream/hipNullStream.cpp | 4 +-- tests/src/runtimeApi/stream/hipStream.h | 2 +- .../synchronization/copy_coherency.cpp | 8 ++--- .../synchronization/memcpyInt.device.cpp | 4 +-- tests/src/specialFunc.cu | 2 +- tests/src/stress/hipStressAsync.cpp | 2 +- tests/src/stress/hipStressChain.cpp | 2 +- tests/src/stress/hipStressKernel.cpp | 2 +- tests/src/stress/hipStressSync.cpp | 2 +- tests/src/test_common.h | 20 ++++++------ tests/src/texture/hipTextureObj2D.cpp | 4 +-- tests/src/texture/hipTextureRef2D.cpp | 4 +-- 62 files changed, 152 insertions(+), 152 deletions(-) diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu index 82b31db14a..ccaa9ae0bc 100644 --- a/samples/0_Intro/square/square.cu +++ b/samples/0_Intro/square/square.cu @@ -40,8 +40,8 @@ template __global__ void vector_square(T *C_d, const T *A_d, size_t N) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i(&f[idx]), diff --git a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index c8c2e644c3..dac56bf709 100644 --- a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. 
#define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; // Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i>pshift] = __any(tid -77); - device_all[threadIdx.x>>pshift] = __all(tid -77); + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + device_any[hipThreadIdx_x>>pshift] = __any(tid -77); + device_all[hipThreadIdx_x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp index 14b8f314a1..742c47a065 100644 --- a/tests/src/deviceLib/hip_ballot.cpp +++ b/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const unsigned int warp_num = threadIdx.x >> pshift; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const unsigned int warp_num = hipThreadIdx_x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp index c08c39dec9..855a8bec47 100644 --- a/tests/src/deviceLib/hip_brev.cpp +++ b/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + 
threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_clz.cpp b/tests/src/deviceLib/hip_clz.cpp index 53fd611184..bdb31f3e8d 100644 --- a/tests/src/deviceLib/hip_clz.cpp +++ b/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp index 49530bb298..c855ede060 100644 --- a/tests/src/deviceLib/hip_ffs.cpp +++ b/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_popc.cpp b/tests/src/deviceLib/hip_popc.cpp index 19dafb4d43..e503e55b42 100644 --- a/tests/src/deviceLib/hip_popc.cpp +++ b/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git 
a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index 4db522cc10..5540c4917d 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_test_make_type.cpp b/tests/src/deviceLib/hip_test_make_type.cpp index 6eba236e12..ce689ceb89 100644 --- a/tests/src/deviceLib/hip_test_make_type.cpp +++ b/tests/src/deviceLib/hip_test_make_type.cpp @@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp, char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp, char2* a, const char2* bm, const char2* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp, char3* a, const char3* bm, const char3* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -86,8 +86,8 @@ 
vectoradd_char4(hipLaunchParm lp, char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp, __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - int x = blockDimX * blockIdx.x + threadIdx.x; - int y = blockDimY * blockIdy.y + threadIdx.y; + int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x; + int y = blockDimY * blockIdy.y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp index 6ee8dc58ad..5ec28101f3 100644 --- a/tests/src/deviceLib/hip_trig.cpp +++ b/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. #define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/tests/src/experimental/xcompile/hHip.c b/tests/src/experimental/xcompile/hHip.c index 17e7e9ecf6..2ac4ebc73e 100644 --- a/tests/src/experimental/xcompile/hHip.c +++ b/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. 
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp index 5dca6c1bca..d1bbed63cd 100644 --- a/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. __global__ void Kern(hipLaunchParm lp, float *A) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; A[tx] += 1.0f; } diff --git a/tests/src/experimental/xcompile/hxxHip.cpp b/tests/src/experimental/xcompile/hxxHip.cpp index bca5d64afc..6a748d5c89 100644 --- a/tests/src/experimental/xcompile/hxxHip.cpp +++ b/tests/src/experimental/xcompile/hxxHip.cpp @@ -33,7 +33,7 @@ class memManager; template __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/hipC.c b/tests/src/hipC.c index efa03bb909..644df6c98f 100644 --- a/tests/src/hipC.c +++ b/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. 
#define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); - size_t tid = threadIdx.x; + size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t tid = hipThreadIdx_x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/tests/src/kernel/hipDynamicShared2.cpp b/tests/src/kernel/hipDynamicShared2.cpp index 4567ff6fc2..95e70a9956 100644 --- a/tests/src/kernel/hipDynamicShared2.cpp +++ b/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = threadIdx.x; + int tx = hipThreadIdx_x; for(int i=0;i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Ad[tx] = Ad[tx] + float(1); } diff --git a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index 5cd46c808a..c4f4b23dc0 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -70,8 +70,8 @@ template __global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Array[tx] = Array[tx] + T(1); } diff --git a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp index 
66b93a164f..4e343121ed 100644 --- a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp +++ b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp @@ -29,7 +29,7 @@ THE SOFTWARE. const int NN = 1 << 21; __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){ - int tid = threadIdx.x; + int tid = hipThreadIdx_x; if(tid < 1){ for(int i=0;i __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; In[tx] = In[tx] + 1; } diff --git a/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/tests/src/runtimeApi/synchronization/copy_coherency.cpp index b2a66f61e2..e4bfb98206 100644 --- a/tests/src/runtimeApi/synchronization/copy_coherency.cpp +++ b/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); __global__ void memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } @@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements) __global__ void memcpyIntKernel(int *dst, const int * src, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp index 2916d51bf9..b34d331682 100644 --- a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp +++ b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp @@ -5,8 +5,8 @@ extern 
"C" __global__ void memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/tests/src/specialFunc.cu b/tests/src/specialFunc.cu index 085be062d9..744dcd8926 100644 --- a/tests/src/specialFunc.cu +++ b/tests/src/specialFunc.cu @@ -23,7 +23,7 @@ THE SOFTWARE. void __global__ test_kernel(float *A) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; float a = __ballot(tid < 16); float b = __shfl(tid < 16); diff --git a/tests/src/stress/hipStressAsync.cpp b/tests/src/stress/hipStressAsync.cpp index a142b41730..1f8cab1a36 100644 --- a/tests/src/stress/hipStressAsync.cpp +++ b/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. 
#define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i = 0; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -169,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; diff --git a/tests/src/texture/hipTextureObj2D.cpp b/tests/src/texture/hipTextureObj2D.cpp index 9ddafd6b1c..443d708418 100644 --- a/tests/src/texture/hipTextureObj2D.cpp +++ b/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/tests/src/texture/hipTextureRef2D.cpp b/tests/src/texture/hipTextureRef2D.cpp index c42f09d5a0..ebc7a04385 100644 --- a/tests/src/texture/hipTextureRef2D.cpp +++ b/tests/src/texture/hipTextureRef2D.cpp @@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else From 7acb1e6ff6152bc2f58d0761ac00c06415db7c5e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:47:04 +0000 Subject: [PATCH 26/35] Use a much 
simpler guard for version 1.6, which allows for direct CUDA indexing to be used. --- include/hip/hcc_detail/host_defines.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index 56cfa0cc0f..d600956087 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -44,8 +44,12 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -#define __global__ \ - __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) + #if __hcc_workweek__ >=17481 + #define __global__ \ + __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) + #else + #define __global__ __attribute__((hc, used)) + #endif #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From 32e11e7dc6af2768dc10cac8a3ae96d1367da376 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:49:10 +0000 Subject: [PATCH 27/35] Revert "Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions." 
This reverts commit d2fd1f5 --- include/hip/hcc_detail/hip_runtime.h | 40 +++++++++---------- samples/0_Intro/square/square.cu | 4 +- src/device_util.cpp | 4 +- src/hip_memory.cpp | 4 +- .../device/hipFuncDeviceSynchronize.cpp | 2 +- tests/src/deviceLib/hipComplex.cpp | 2 +- tests/src/deviceLib/hipDeviceMemcpy.cpp | 4 +- tests/src/deviceLib/hipFloatMath.cpp | 2 +- tests/src/deviceLib/hipSimpleAtomicsTest.cpp | 2 +- tests/src/deviceLib/hipTestDevice.cpp | 32 +++++++-------- tests/src/deviceLib/hipTestDeviceDouble.cpp | 28 ++++++------- tests/src/deviceLib/hipTestDeviceSymbol.cpp | 2 +- tests/src/deviceLib/hipTestHalf.cpp | 4 +- tests/src/deviceLib/hipThreadFence.cpp | 2 +- tests/src/deviceLib/hip_anyall.cpp | 6 +-- tests/src/deviceLib/hip_ballot.cpp | 8 ++-- tests/src/deviceLib/hip_brev.cpp | 4 +- tests/src/deviceLib/hip_clz.cpp | 4 +- tests/src/deviceLib/hip_ffs.cpp | 4 +- tests/src/deviceLib/hip_popc.cpp | 4 +- tests/src/deviceLib/hip_test_ldg.cpp | 4 +- tests/src/deviceLib/hip_test_make_type.cpp | 20 +++++----- tests/src/deviceLib/hip_trig.cpp | 2 +- tests/src/experimental/xcompile/hHip.c | 2 +- tests/src/experimental/xcompile/hipxxKer.cpp | 2 +- tests/src/experimental/xcompile/hxxHip.cpp | 2 +- tests/src/hipC.c | 2 +- tests/src/hipC.cpp | 2 +- tests/src/hipCKernel.c | 2 +- tests/src/kernel/hipDynamicShared.cpp | 4 +- tests/src/kernel/hipDynamicShared2.cpp | 2 +- tests/src/kernel/hipGridLaunch.cpp | 4 +- tests/src/kernel/hipLanguageExtensions.cpp | 8 ++-- tests/src/kernel/hipTestConstant.cpp | 2 +- tests/src/kernel/hipTestMallocKernel.cpp | 4 +- tests/src/kernel/hipTestMemKernel.cpp | 20 +++++----- tests/src/kernel/inline_asm_vadd.cpp | 2 +- tests/src/kernel/inline_asm_vmac.cpp | 2 +- tests/src/kernel/launch_bounds.cpp | 2 +- .../device/hipDeviceSynchronize.cpp | 2 +- .../src/runtimeApi/memory/hipHostGetFlags.cpp | 2 +- tests/src/runtimeApi/memory/hipHostMalloc.cpp | 4 +- .../src/runtimeApi/memory/hipHostRegister.cpp | 2 +- 
.../src/runtimeApi/memory/hipMemcpyAsync.cpp | 4 +- .../memory/hipMemoryAllocateCoherent.cpp | 2 +- .../runtimeApi/memory/p2p_copy_coherency.cpp | 8 ++-- tests/src/runtimeApi/module/hipModule.cpp | 2 +- tests/src/runtimeApi/module/vcpy_kernel.cpp | 2 +- .../multiThread/hipMultiThreadStreams2.cpp | 2 +- .../runtimeApi/stream/hipAPIStreamDisable.cpp | 4 +- .../runtimeApi/stream/hipAPIStreamEnable.cpp | 4 +- tests/src/runtimeApi/stream/hipNullStream.cpp | 4 +- tests/src/runtimeApi/stream/hipStream.h | 2 +- .../synchronization/copy_coherency.cpp | 8 ++-- .../synchronization/memcpyInt.device.cpp | 4 +- tests/src/specialFunc.cu | 2 +- tests/src/stress/hipStressAsync.cpp | 2 +- tests/src/stress/hipStressChain.cpp | 2 +- tests/src/stress/hipStressKernel.cpp | 2 +- tests/src/stress/hipStressSync.cpp | 2 +- tests/src/test_common.h | 20 +++++----- tests/src/texture/hipTextureObj2D.cpp | 4 +- tests/src/texture/hipTextureRef2D.cpp | 4 +- 63 files changed, 171 insertions(+), 173 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 944f74864b..924e774af0 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -381,29 +381,27 @@ __device__ void __threadfence_system(void) ; * @} */ -#if __hcc_workweek__ >= 17481 - template::type f> - class Coordinates { - using R = decltype(f(0)); +template::type f> +class Coordinates { + using R = decltype(f(0)); - struct X { __device__ operator R() const { return f(0); } }; - struct Y { __device__ operator R() const { return f(1); } }; - struct Z { __device__ operator R() const { return f(2); } }; - public: - static constexpr X x{}; - static constexpr Y y{}; - static constexpr Z z{}; - }; + struct X { __device__ operator R() const { return f(0); } }; + struct Y { __device__ operator R() const { return f(1); } }; + struct Z { __device__ operator R() const { return f(2); } }; +public: + static constexpr X x{}; + static constexpr Y y{}; + static constexpr 
Z z{}; +}; - static constexpr Coordinates blockDim; - static constexpr Coordinates blockIdx; - static constexpr Coordinates gridDim; - static constexpr Coordinates threadIdx; -#endif +static constexpr Coordinates blockDim; +static constexpr Coordinates blockIdx; +static constexpr Coordinates gridDim; +static constexpr Coordinates threadIdx; #define hipThreadIdx_x (hc_get_workitem_id(0)) #define hipThreadIdx_y (hc_get_workitem_id(1)) diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu index ccaa9ae0bc..82b31db14a 100644 --- a/samples/0_Intro/square/square.cu +++ b/samples/0_Intro/square/square.cu @@ -40,8 +40,8 @@ template __global__ void vector_square(T *C_d, const T *A_d, size_t N) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i(&f[idx]), diff --git a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index dac56bf709..c8c2e644c3 100644 --- a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; // Kernel loop designed to execute very slowly... ... ... 
so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i>pshift] = __any(tid -77); - device_all[hipThreadIdx_x>>pshift] = __all(tid -77); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + device_any[threadIdx.x>>pshift] = __any(tid -77); + device_all[threadIdx.x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp index 742c47a065..14b8f314a1 100644 --- a/tests/src/deviceLib/hip_ballot.cpp +++ b/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const unsigned int warp_num = hipThreadIdx_x >> pshift; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const unsigned int warp_num = threadIdx.x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp index 855a8bec47..c08c39dec9 100644 --- a/tests/src/deviceLib/hip_brev.cpp +++ b/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git 
a/tests/src/deviceLib/hip_clz.cpp b/tests/src/deviceLib/hip_clz.cpp index bdb31f3e8d..53fd611184 100644 --- a/tests/src/deviceLib/hip_clz.cpp +++ b/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp index c855ede060..49530bb298 100644 --- a/tests/src/deviceLib/hip_ffs.cpp +++ b/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_popc.cpp b/tests/src/deviceLib/hip_popc.cpp index e503e55b42..19dafb4d43 100644 --- a/tests/src/deviceLib/hip_popc.cpp +++ b/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index 5540c4917d..4db522cc10 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* 
a, const T* bm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_test_make_type.cpp b/tests/src/deviceLib/hip_test_make_type.cpp index ce689ceb89..6eba236e12 100644 --- a/tests/src/deviceLib/hip_test_make_type.cpp +++ b/tests/src/deviceLib/hip_test_make_type.cpp @@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp, char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp, char2* a, const char2* bm, const char2* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp, char3* a, const char3* bm, const char3* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp, char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = 
blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp, __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x; - int y = blockDimY * blockIdy.y + hipThreadIdx_y; + int x = blockDimX * blockIdx.x + threadIdx.x; + int y = blockDimY * blockIdy.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp index 5ec28101f3..6ee8dc58ad 100644 --- a/tests/src/deviceLib/hip_trig.cpp +++ b/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. #define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/tests/src/experimental/xcompile/hHip.c b/tests/src/experimental/xcompile/hHip.c index 2ac4ebc73e..17e7e9ecf6 100644 --- a/tests/src/experimental/xcompile/hHip.c +++ b/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp index d1bbed63cd..5dca6c1bca 100644 --- a/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. 
__global__ void Kern(hipLaunchParm lp, float *A) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; A[tx] += 1.0f; } diff --git a/tests/src/experimental/xcompile/hxxHip.cpp b/tests/src/experimental/xcompile/hxxHip.cpp index 6a748d5c89..bca5d64afc 100644 --- a/tests/src/experimental/xcompile/hxxHip.cpp +++ b/tests/src/experimental/xcompile/hxxHip.cpp @@ -33,7 +33,7 @@ class memManager; template __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/hipC.c b/tests/src/hipC.c index 644df6c98f..efa03bb909 100644 --- a/tests/src/hipC.c +++ b/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. #define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t tid = hipThreadIdx_x; + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t tid = threadIdx.x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/tests/src/kernel/hipDynamicShared2.cpp b/tests/src/kernel/hipDynamicShared2.cpp index 95e70a9956..4567ff6fc2 100644 --- a/tests/src/kernel/hipDynamicShared2.cpp +++ b/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. 
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = hipThreadIdx_x; + int tx = threadIdx.x; for(int i=0;i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; Ad[tx] = Ad[tx] + float(1); } diff --git a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index c4f4b23dc0..5cd46c808a 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -70,8 +70,8 @@ template __global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; Array[tx] = Array[tx] + T(1); } diff --git a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp index 4e343121ed..66b93a164f 100644 --- a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp +++ b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp @@ -29,7 +29,7 @@ THE SOFTWARE. 
const int NN = 1 << 21; __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){ - int tid = hipThreadIdx_x; + int tid = threadIdx.x; if(tid < 1){ for(int i=0;i __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; In[tx] = In[tx] + 1; } diff --git a/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/tests/src/runtimeApi/synchronization/copy_coherency.cpp index e4bfb98206..b2a66f61e2 100644 --- a/tests/src/runtimeApi/synchronization/copy_coherency.cpp +++ b/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); __global__ void memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } @@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements) __global__ void memcpyIntKernel(int *dst, const int * src, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp index b34d331682..2916d51bf9 100644 --- a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp +++ b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp @@ -5,8 +5,8 @@ extern "C" __global__ void memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = 
hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/tests/src/specialFunc.cu b/tests/src/specialFunc.cu index 744dcd8926..085be062d9 100644 --- a/tests/src/specialFunc.cu +++ b/tests/src/specialFunc.cu @@ -23,7 +23,7 @@ THE SOFTWARE. void __global__ test_kernel(float *A) { - int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int tid = blockIdx.x * blockDim.x + threadIdx.x; float a = __ballot(tid < 16); float b = __shfl(tid < 16); diff --git a/tests/src/stress/hipStressAsync.cpp b/tests/src/stress/hipStressAsync.cpp index 1f8cab1a36..a142b41730 100644 --- a/tests/src/stress/hipStressAsync.cpp +++ b/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i = 0; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -169,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; diff --git a/tests/src/texture/hipTextureObj2D.cpp b/tests/src/texture/hipTextureObj2D.cpp index 443d708418..9ddafd6b1c 100644 --- a/tests/src/texture/hipTextureObj2D.cpp +++ b/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + 
threadIdx.y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/tests/src/texture/hipTextureRef2D.cpp b/tests/src/texture/hipTextureRef2D.cpp index ebc7a04385..c42f09d5a0 100644 --- a/tests/src/texture/hipTextureRef2D.cpp +++ b/tests/src/texture/hipTextureRef2D.cpp @@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + threadIdx.y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else From 20fc68c9a12294118795d4f39463a4a93988bb33 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:50:43 +0000 Subject: [PATCH 28/35] Add missing space (the final frontier). --- include/hip/hcc_detail/host_defines.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index d600956087..a7acdfccf7 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -44,7 +44,7 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else - #if __hcc_workweek__ >=17481 + #if __hcc_workweek__ >= 17481 #define __global__ \ __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) #else From 7c0b9a005bed4eb9a4bc80841dcc78e1d8d813cc Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 30 Nov 2017 03:29:04 +0000 Subject: [PATCH 29/35] Fix legacy mode detection of the address of an agent allocated variable. In this mode, there exist two executables for each code object, one created by HCC and one created by HIP. Since we dispatch through HCC in legacy mode, we should obtain the address for an agent allocated variable from the executable created by HCC.
Also add two omitted validity checks, whose absence could lead to segfaults when the current process had no .kernel section and / or when an invalid or empty blob was extracted from the latter. --- include/hip/hcc_detail/code_object_bundle.hpp | 2 + src/hip_memory.cpp | 44 ++++++++++++------- src/program_state.cpp | 2 +- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp index 05ba44fcc8..72f9d35c73 100644 --- a/include/hip/hcc_detail/code_object_bundle.hpp +++ b/include/hip/hcc_detail/code_object_bundle.hpp @@ -76,6 +76,8 @@ namespace hip_impl RandomAccessIterator l, Bundled_code_header& x) { + if (f == l) return false; + std::copy_n(f, sizeof(x.cbuf_), x.cbuf_); if (valid(x)) { diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index c88a1dabc1..fb25101d7e 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -808,6 +808,26 @@ hipError_t hipHostUnregister(void *hostPtr) return ihipLogStatus(hip_status); } +namespace +{ + inline + hipDeviceptr_t agent_address_for_symbol(const char* symbolName) + { + hipDeviceptr_t r = nullptr; + + #if __hcc_workweek__ >= 17481 + size_t byte_cnt = 0u; + hipModuleGetGlobal(&r, &byte_cnt, 0, symbolName); + #else + auto ctx = ihipGetTlsDefaultCtx(); + auto acc = ctx->getDevice()->_acc; + r = acc.get_symbol_address(symbolName); + #endif + + return r; + } +} + hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind); @@ -821,10 +841,8 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t dst = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &dst, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t dst = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM,
" symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -859,10 +877,8 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t src = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &src, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t src = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -899,10 +915,8 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t dst = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &dst, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t dst = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -940,10 +954,8 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t src = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &src, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t src = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src); if(src == nullptr || dst == nullptr) diff --git a/src/program_state.cpp b/src/program_state.cpp index 61c90556be..47071d0236 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -288,7 +288,7 @@ namespace return x->get_type() == SHT_SYMTAB; }); - r = function_names_for(reader, symtab); + if (symtab) r = function_names_for(reader, symtab); }); return r; From 4313686d6ee5cb28698f0b0b1a441212b8e78b01 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 21 Nov 2017 15:41:15 -0600 Subject: [PATCH 30/35] Fix warning from default cppcheck.
--- src/hip_hcc_internal.h | 6 +++--- src/hip_memory.cpp | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 3fd09630d9..69434a6327 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -248,8 +248,8 @@ static const DbName dbName [] = #if COMPILE_HIP_DB #define tprintf(trace_level, ...) {\ if (HIP_DB & (1<<(trace_level))) {\ - char msgStr[2000];\ - snprintf(msgStr, 2000, __VA_ARGS__);\ + char msgStr[1000];\ + snprintf(msgStr, sizeof(msgStr), __VA_ARGS__);\ fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \ }\ } @@ -269,7 +269,7 @@ extern uint64_t recordApiTrace(std::string *fullStr, const std::string &apiStr); #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) #define API_TRACE(forceTrace, ...)\ -uint64_t hipApiStartTick;\ +uint64_t hipApiStartTick=0;\ {\ tls_tidInfo.incApiSeqNum();\ if (forceTrace || (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1<psize = psize; From 9bba97fdcc825398ba443942b0c0375aa0bba329 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 21 Nov 2017 16:44:28 -0600 Subject: [PATCH 31/35] Fix some cppcheck style issues. 
--- src/hip_device.cpp | 6 +++--- src/hip_hcc.cpp | 2 ++ src/hip_hcc_internal.h | 11 ++++++----- src/hip_memory.cpp | 26 +++++++++++++------------- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 521b56b0e9..4f08d49bf3 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -446,14 +446,14 @@ hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop ) { HIP_INIT_API(device,prop); hipDeviceProp_t tempProp; - int deviceCount; - int inPropCount = 0; - int matchedPropCount = 0; hipError_t e = hipSuccess; if((device == NULL) || (prop == NULL)) { e = hipErrorInvalidValue; } if(e == hipSuccess) { + int deviceCount; + int inPropCount = 0; + int matchedPropCount = 0; ihipGetDeviceCount( &deviceCount ); *device = 0; for (int i = 0; i < deviceCount; i++) { diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 2eff57f18b..f15a0eb1d8 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -779,6 +779,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) // Get agent name err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, &(prop->name)); + DeviceErrorCheck(err); + char archName[256]; err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName); diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 69434a6327..3fbd3cfacc 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -339,7 +339,7 @@ uint64_t hipApiStartTick=0;\ class ihipException : public std::exception { public: - ihipException(hipError_t e) : _code(e) {}; + explicit ihipException(hipError_t e) : _code(e) {}; hipError_t _code; }; @@ -669,7 +669,7 @@ template class ihipEventCriticalBase_t : LockedBase { public: - ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) : + explicit ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) : _parent(parentEvent) {} ~ihipEventCriticalBase_t() {}; @@ -690,7 +690,7 @@ typedef LockedAccessor LockedAccessor_EventCrit_t; // 
internal hip event structure. class ihipEvent_t { public: - ihipEvent_t(unsigned flags); + explicit ihipEvent_t(unsigned flags); void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); std::pair refreshEventStatus(); // returns pair @@ -720,8 +720,9 @@ template class ihipDeviceCriticalBase_t : LockedBase { public: - ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) : - _parent(parentDevice) + explicit ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) : + _parent(parentDevice), + _ctxCount(0) { }; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 8ed5e600e2..cedc3c59b5 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -44,7 +44,7 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK try { stream->locked_copyAsync(dst, src, sizeBytes, kind); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } } else { @@ -928,7 +928,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ try { stream->lockedSymbolCopyAsync(acc, dst, (void*)src, count, offset, kind); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } } else { @@ -968,7 +968,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co try { stream->lockedSymbolCopyAsync(acc, dst, src, count, offset, kind); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } } else { @@ -993,7 +993,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind stream->locked_copySync(dst, src, sizeBytes, kind); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1015,7 +1015,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes) stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1037,7 +1037,7 @@ hipError_t 
hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes) stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1059,7 +1059,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1081,7 +1081,7 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1140,7 +1140,7 @@ hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch stream->locked_copySync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind); } } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1178,7 +1178,7 @@ hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t sp e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream); } } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1231,7 +1231,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con stream->locked_copySync((unsigned char*)dst->data + i*dst_w, (unsigned char*)src + i*src_w, width, kind); } } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1252,7 +1252,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, try { stream->locked_copySync((char *)dst->data + wOffset, src, count, kind); } - catch (ihipException ex) { + catch (ihipException &ex) { e = ex._code; } @@ -1302,7 +1302,7 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) } } } - catch (ihipException ex) { + catch 
(ihipException &ex) { e = ex._code; } From 62cbe22d5089c4b7bab6c7ed7e95fd1fa4924298 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 1 Dec 2017 21:46:28 +0000 Subject: [PATCH 32/35] Temporarily disable a couple tests pending some HCC work --- tests/src/deviceLib/hipTestDeviceDouble.cpp | 4 ++-- tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp index 0b5d18fe5a..659d53cbb3 100644 --- a/tests/src/deviceLib/hipTestDeviceDouble.cpp +++ b/tests/src/deviceLib/hipTestDeviceDouble.cpp @@ -18,8 +18,8 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp - * RUN: %t + * XXBUILD: %t %s ../test_common.cpp + * XXRUN: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index a7a930b4f6..a234e235be 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -18,8 +18,8 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 - * RUN: %t + * ZZZBUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * ZZZRUN: %t * HIT_END */ From 6027d3f332df53b31f31b71b30a6378f6f1d1699 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 2 Dec 2017 00:01:47 +0000 Subject: [PATCH 33/35] Replace archaic use of homebrew functionality with calls to the HC maths library. This fixes a hang observed when building hipTestDeviceDouble. 
--- src/device_util.cpp | 84 +----------------------------------------- src/math_functions.cpp | 11 ++---- 2 files changed, 5 insertions(+), 90 deletions(-) diff --git a/src/device_util.cpp b/src/device_util.cpp index b6aebdfce0..6edad53bcb 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -147,91 +147,11 @@ __device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) } __device__ float __hip_erfinvf(float x){ - float ret; - int sign; - if (x < -1 || x > 1){ - return NAN; - } - if (x == 0){ - return 0; - } - if (x > 0){ - sign = 1; - } else { - sign = -1; - x = -x; - } - if (x <= 0.7) { - float x1 = x * x; - float x2 = __hip_erfinva3 * x1 + __hip_erfinva2; - float x3 = x2 * x1 + __hip_erfinva1; - float x4 = x * (x3 * x1 + __hip_erfinva0); - - float r1 = __hip_erfinvb4 * x1 + __hip_erfinvb3; - float r2 = r1 * x1 + __hip_erfinvb2; - float r3 = r2 * x1 + __hip_erfinvb1; - ret = x4 / (r3 * x1 + __hip_erfinvb0); - } else { - float x1 = hc::precise_math::sqrtf(-hc::precise_math::logf((1 - x) / 2)); - float x2 = __hip_erfinvc3 * x1 + __hip_erfinvc2; - float x3 = x2 * x1 + __hip_erfinvc1; - float x4 = x3 * x1 + __hip_erfinvc0; - - float r1 = __hip_erfinvd2 * x1 + __hip_erfinvd1; - ret = x4 / (r1 * x1 + __hip_erfinvd0); - } - - ret = ret * sign; - x = x * sign; - - ret -= (hc::precise_math::erff(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::expf(-ret * ret)); - ret -= (hc::precise_math::erff(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::expf(-ret * ret)); - - return ret; + return hc::precise_math::erfinvf(x); } __device__ double __hip_erfinv(double x){ - double ret; - int sign; - if (x < -1 || x > 1){ - return NAN; - } - if (x == 0){ - return 0; - } - if (x > 0){ - sign = 1; - } else { - sign = -1; - x = -x; - } - if (x <= 0.7) { - double x1 = x * x; - double x2 = __hip_erfinva3 * x1 + __hip_erfinva2; - double x3 = x2 * x1 + __hip_erfinva1; - double x4 = x * (x3 * x1 + __hip_erfinva0); - - double r1 = __hip_erfinvb4 * x1 + __hip_erfinvb3; - double 
r2 = r1 * x1 + __hip_erfinvb2; - double r3 = r2 * x1 + __hip_erfinvb1; - ret = x4 / (r3 * x1 + __hip_erfinvb0); - } else { - double x1 = hc::precise_math::sqrt(-hc::precise_math::log((1 - x) / 2)); - double x2 = __hip_erfinvc3 * x1 + __hip_erfinvc2; - double x3 = x2 * x1 + __hip_erfinvc1; - double x4 = x3 * x1 + __hip_erfinvc0; - - double r1 = __hip_erfinvd2 * x1 + __hip_erfinvd1; - ret = x4 / (r1 * x1 + __hip_erfinvd0); - } - - ret = ret * sign; - x = x * sign; - - ret -= (hc::precise_math::erf(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::exp(-ret * ret)); - ret -= (hc::precise_math::erf(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::exp(-ret * ret)); - - return ret; + return hc::precise_math::erfinv(x); } #define __hip_j0a1 57568490574.0 diff --git a/src/math_functions.cpp b/src/math_functions.cpp index 80ccece1a3..9dd27a7082 100644 --- a/src/math_functions.cpp +++ b/src/math_functions.cpp @@ -84,7 +84,7 @@ __device__ float erfcf(float x) } __device__ float erfcinvf(float y) { - return __hip_erfinvf(1 - y); + return hc::precise_math::erfcinvf(y); } __device__ float erfcxf(float x) { @@ -96,7 +96,7 @@ __device__ float erff(float x) } __device__ float erfinvf(float y) { - return __hip_erfinvf(y); + return hc::precise_math::erfinvf(y);//__hip_erfinvf(y); } __device__ float exp10f(float x) { @@ -192,12 +192,7 @@ __device__ float ldexpf(float x, int exp) } __device__ float lgammaf(float x) { - float val = 0.0f; - float y = x - 1; - while(y > 0){ - val += logf(y--); - } - return val; + return hc::precise_math::lgammaf(x); } __device__ long long int llrintf(float x) { From 02fc25de4cb20e452951ca738917b9c4a4ac7691 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 2 Dec 2017 00:03:10 +0000 Subject: [PATCH 34/35] Remove stray leftover comment. 
--- src/math_functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/math_functions.cpp b/src/math_functions.cpp index 9dd27a7082..9118318469 100644 --- a/src/math_functions.cpp +++ b/src/math_functions.cpp @@ -96,7 +96,7 @@ __device__ float erff(float x) } __device__ float erfinvf(float y) { - return hc::precise_math::erfinvf(y);//__hip_erfinvf(y); + return hc::precise_math::erfinvf(y); } __device__ float exp10f(float x) { From 5127ce67e843ebc22f0262d8091f2c46817ccb5d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 3 Dec 2017 23:09:06 +0000 Subject: [PATCH 35/35] This is primarily intended as an additional cleanup of the module functionality, in the aftermath of adopting module based dispatch. The main effort was associated with refactoring the questionable ihipModuleGetSymbol. It was quaintly written and misleading, in that it had little to do with getting symbols, and was exactly retrieving a kernel object. Error handling is modified so as to reduce branching depth. Functions which serve as interfaces to the HSA RT are moved into a separate helper header. Code object readers are properly deleted. Some leftover dead functionality pertaining to associating namespace scope variables with their allocated memory is removed. Executable loading is changed to use a string which holds the ELF image of the code object being loaded, thus avoiding some corner cases where using an istream would fail.
--- include/hip/hcc_detail/program_state.hpp | 4 +- src/hip_hcc_internal.h | 24 +- src/hip_module.cpp | 526 ++++++++--------------- src/hsa_helpers.hpp | 112 +++++ src/program_state.cpp | 135 ++---- 5 files changed, 340 insertions(+), 461 deletions(-) create mode 100644 src/hsa_helpers.hpp diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp index 65896e97a7..02e2f1e524 100644 --- a/include/hip/hcc_detail/program_state.hpp +++ b/include/hip/hcc_detail/program_state.hpp @@ -80,5 +80,7 @@ namespace hip_impl std::unordered_map& globals(); hsa_executable_t load_executable( - hsa_executable_t executable, hsa_agent_t agent, std::istream& file); + const std::string& file, + hsa_executable_t executable, + hsa_agent_t agent); } // Namespace hip_impl. \ No newline at end of file diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 3fd09630d9..d3f4c8d584 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -372,16 +372,16 @@ public: }; -class ihipModule_t { -public: - hsa_executable_t executable; - hsa_code_object_t object; +struct ihipModule_t { std::string fileName; - void *ptr; - size_t size; - std::list funcTrack; - std::unordered_map coGlobals; - ihipModule_t() : executable(), object(), fileName(), ptr(nullptr), size(0) {} + hsa_executable_t executable = {}; + hsa_code_object_reader_t coReader = {}; + + ~ihipModule_t() + { + if (executable.handle) hsa_executable_destroy(executable); + if (coReader.handle) hsa_code_object_reader_destroy(coReader); + } }; @@ -669,11 +669,11 @@ template class ihipEventCriticalBase_t : LockedBase { public: - ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) : + ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) : _parent(parentEvent) {} ~ihipEventCriticalBase_t() {}; - + // Keep data in structure so it can be easily copied into snapshots // (used to reduce lock contention and preserve correct lock order) ihipEventData_t _eventData; @@ -698,7 +698,7 @@ public: 
// Return a copy of the critical state. The critical data is locked during the copy. ihipEventData_t locked_copyCrit() { LockedAccessor_EventCrit_t crit(_criticalData); - return _criticalData._eventData; + return _criticalData._eventData; }; ihipEventCritical_t &criticalData() { return _criticalData; }; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 1477247ae2..45a44b3666 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -20,63 +20,65 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "elfio/elfio.hpp" +#include "hip/hip_runtime.h" +#include "hip/hcc_detail/program_state.hpp" +#include "hip_hcc_internal.h" +#include "hsa_helpers.hpp" +#include "trace_helper.h" #include #include #include -#include "elfio/elfio.hpp" -#include "hip/hip_runtime.h" -#include "hip/hcc_detail/program_state.hpp" -#include "hip_hcc_internal.h" -#include "trace_helper.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include //TODO Use Pool APIs from HCC to get memory regions. -#include +using namespace ELFIO; +using namespace hip_impl; +using namespace std; + inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { assert(Align != 0u && "Align can't be 0."); Skew %= Align; return (Value + Align - 1 - Skew) / Align * Align + Skew; } + struct ihipKernArgInfo{ - std::vector Size; - std::vector Align; - std::vector ArgType; - std::vector ArgName; + vector Size; + vector Align; + vector ArgType; + vector ArgName; uint32_t totalSize; }; -std::map kernelArguments; - -struct MyElfNote { - uint32_t n_namesz = 0; - uint32_t n_descsz = 0; - uint32_t n_type = 0; - - MyElfNote() = default; -}; +map kernelArguments; struct ihipModuleSymbol_t{ - uint64_t _object; // The kernel object. 
- uint32_t _groupSegmentSize; - uint32_t _privateSegmentSize; - std::string _name; // TODO - review for performance cost. Name is just used for debug. + uint64_t _object; // The kernel object. + uint32_t _groupSegmentSize; + uint32_t _privateSegmentSize; + string _name; // TODO - review for performance cost. Name is just used for debug. }; template <> -std::string ToString(hipFunction_t v) +string ToString(hipFunction_t v) { std::ostringstream ss; ss << "0x" << std::hex << v->_object; @@ -94,113 +96,20 @@ if (hsaStatus != HSA_STATUS_SUCCESS) {\ return ihipLogStatus(hipStatus);\ } -namespace hipdrv { - - hsa_status_t findSystemRegions(hsa_region_t region, void *data){ - hsa_region_segment_t segment_id; - hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); - - if(segment_id != HSA_REGION_SEGMENT_GLOBAL){ - return HSA_STATUS_SUCCESS; - } - - hsa_region_global_flag_t flags; - hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); - - hsa_region_t *reg = (hsa_region_t*)data; - - if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){ - *reg = region; - } - - return HSA_STATUS_SUCCESS; - } - -} // End namespace hipdrv - -uint64_t PrintSymbolSizes(const void *emi, const char *name){ - using namespace ELFIO; - - const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi; - if(NULL == ehdr || EV_CURRENT != ehdr->e_version){} - const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff); - for(uint16_t i=0;ie_shnum;++i){ - if(shdr[i].sh_type == SHT_SYMTAB){ - const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset); - assert(syms); - uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize; - const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); - assert(strtab); - for(uint64_t i=0;ie_shoff); - - uint64_t max_offset = ehdr->e_shoff; - uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; - - for(uint16_t i=0;i < ehdr->e_shnum;++i){ - uint64_t cur_offset = static_cast(shdr[i].sh_offset); - 
if(max_offset < cur_offset){ - max_offset = cur_offset; - total_size = max_offset; - if(SHT_NOBITS != shdr[i].sh_type){ - total_size += static_cast(shdr[i].sh_size); - } - } - } - return total_size; -} - hipError_t hipModuleLoad(hipModule_t *module, const char *fname) { HIP_INIT_API(module, fname); - hipError_t ret = hipSuccess; - *module = new ihipModule_t; - if(module == NULL){ - return ihipLogStatus(hipErrorInvalidValue); - } + if (!fname) return ihipLogStatus(hipErrorInvalidValue); - auto ctx = ihipGetTlsDefaultCtx(); - if(ctx == nullptr){ - ret = hipErrorInvalidContext; + ifstream file{fname}; - }else{ - int deviceId = ctx->getDevice()->_deviceId; - ihipDevice_t *currentDevice = ihipGetDevice(deviceId); + if (!file.is_open()) return ihipLogStatus(hipErrorFileNotFound); - hsa_executable_create_alt( - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, - &(*module)->executable); + vector tmp{ + istreambuf_iterator{file}, istreambuf_iterator{}}; - std::ifstream file{fname}; - - if (!file.is_open()) { - return ihipLogStatus(hipErrorFileNotFound); - } - (*module)->executable = hip_impl::load_executable( - (*module)->executable, currentDevice->_hsaAgent, file); - ret = (*module)->executable.handle ? hipSuccess : hipErrorUnknown; - } - - return ihipLogStatus(ret); + return hipModuleLoadData(module, tmp.data()); } @@ -212,92 +121,13 @@ hipError_t hipModuleUnload(hipModule_t hmod) // Currently we want for all inflight activity to complete, but don't prevent another // thread from launching new kernels before we finish this operation. 
ihipSynchronize(); - hipError_t ret = hipSuccess; - hsa_status_t status = hsa_executable_destroy(hmod->executable); - if(status != HSA_STATUS_SUCCESS) - { - ret = hipErrorInvalidValue; - } - // status = hsa_code_object_destroy(hmod->object); - // if(status != HSA_STATUS_SUCCESS) - // { - // ret = hipErrorInvalidValue; - // } - // status = hsa_memory_free(hmod->ptr); - // if(status != HSA_STATUS_SUCCESS) - // { - // ret = hipErrorInvalidValue; - // } - for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) { - delete *f; - } - delete hmod; - return ihipLogStatus(ret); + + delete hmod; // The ihipModule_t dtor will clean everything up. + hmod = nullptr; + + return ihipLogStatus(hipSuccess); } - -hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name) -{ - auto ctx = ihipGetTlsDefaultCtx(); - hipError_t ret = hipSuccess; - - if (name == nullptr){ - return (hipErrorInvalidValue); - } - - if (ctx == nullptr){ - ret = hipErrorInvalidContext; - - } else { - std::string str(name); - for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) { - if((*f)->_name == str) { - *func = *f; - return ret; - } - } - ihipModuleSymbol_t *sym = new ihipModuleSymbol_t; - int deviceId = ctx->getDevice()->_deviceId; - ihipDevice_t *currentDevice = ihipGetDevice(deviceId); - hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent; - - hsa_status_t status; - hsa_executable_symbol_t symbol; - status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &symbol); - if(status != HSA_STATUS_SUCCESS){ - return hipErrorNotFound; - } - - status = hsa_executable_symbol_get_info(symbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, - &sym->_object); - CHECK_HSA(status, hipErrorNotFound); - - status = hsa_executable_symbol_get_info(symbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &sym->_groupSegmentSize); - CHECK_HSA(status, hipErrorNotFound); - - status = hsa_executable_symbol_get_info(symbol, - 
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - &sym->_privateSegmentSize); - CHECK_HSA(status, hipErrorNotFound); - - sym->_name = name; - *func = sym; - hmod->funcTrack.push_back(*func); - } - return ret; -} - - -hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, - const char *name){ - HIP_INIT_API(hfunc, hmod, name); - return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name)); -} - - hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, @@ -448,45 +278,11 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, namespace { struct Agent_global { - std::string name; + string name; hipDeviceptr_t address; - std::uint32_t byte_cnt; + uint32_t byte_cnt; }; - inline - void* address(hsa_executable_symbol_t x) - { - void* r = nullptr; - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r); - - return r; - } - - inline - std::string name(hsa_executable_symbol_t x) - { - uint32_t sz = 0u; - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); - - std::string r(sz, '\0'); - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); - - return r; - } - - inline - std::uint32_t size(hsa_executable_symbol_t x) - { - std::uint32_t r = 0; - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r); - - return r; - } - inline void track(const Agent_global& x) { @@ -511,7 +307,7 @@ namespace hc::am_memtracker_update(x.address, device->_deviceId, 0u); } - template> + template> inline hsa_status_t copy_agent_global_variables( hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* out) @@ -536,26 +332,24 @@ namespace { auto ctx = ihipGetTlsDefaultCtx(); - if (!ctx) throw std::runtime_error{"No active HIP context."}; + if (!ctx) throw runtime_error{"No active HIP context."}; auto 
device = ctx->getDevice(); - if (!device) throw std::runtime_error{"No device available for HIP."}; + if (!device) throw runtime_error{"No device available for HIP."}; ihipDevice_t *currentDevice = ihipGetDevice(device->_deviceId); - if (!currentDevice) { - throw std::runtime_error{"No active device for HIP"}; - } + if (!currentDevice) throw runtime_error{"No active device for HIP."}; return currentDevice->_hsaAgent; } inline - std::vector read_agent_globals( + vector read_agent_globals( hsa_agent_t agent, hsa_executable_t executable) { - std::vector r; + vector r; hsa_executable_iterate_agent_symbols( executable, agent, copy_agent_global_variables, &r); @@ -564,15 +358,14 @@ namespace } template - std::pair read_global_description( + pair read_global_description( ForwardIterator f, ForwardIterator l, const char* name) { const auto it = std::find_if( f, l, [=](const Agent_global& x) { return x.name == name; }); return it == l ? - std::make_pair(nullptr, 0u) : - std::make_pair(it->address, it->byte_cnt); + make_pair(nullptr, 0u) : make_pair(it->address, it->byte_cnt); } hipError_t read_agent_global_from_module( @@ -581,13 +374,12 @@ namespace hipModule_t hmod, const char* name) { - static std::unordered_map< - hipModule_t, std::vector> agent_globals; + static unordered_map> agent_globals; // TODO: this is not particularly robust. if (agent_globals.count(hmod) == 0) { - static std::mutex mtx; - std::lock_guard lck{mtx}; + static mutex mtx; + lock_guard lck{mtx}; if (agent_globals.count(hmod) == 0) { agent_globals.emplace( @@ -599,10 +391,10 @@ namespace // It will have to be properly fleshed out in the future. 
const auto it0 = agent_globals.find(hmod); if (it0 == agent_globals.cend()) { - throw std::runtime_error{"agent_globals data structure corrupted."}; + throw runtime_error{"agent_globals data structure corrupted."}; } - std::tie(*dptr, *bytes) = read_global_description( + tie(*dptr, *bytes) = read_global_description( it0->second.cbegin(), it0->second.cend(), name); return dptr ? hipSuccess : hipErrorNotFound; @@ -611,22 +403,21 @@ namespace hipError_t read_agent_global_from_process( hipDeviceptr_t *dptr, size_t* bytes, const char* name) { - static std::unordered_map< - hsa_agent_t, std::vector> agent_globals; + static unordered_map> agent_globals; static std::once_flag f; - std::call_once(f, []() { + call_once(f, []() { for (auto&& agent_executables : hip_impl::executables()) { - std::vector tmp0; + vector tmp0; for (auto&& executable : agent_executables.second) { auto tmp1 = read_agent_globals( agent_executables.first, executable); tmp0.insert( tmp0.end(), - std::make_move_iterator(tmp1.begin()), - std::make_move_iterator(tmp1.end())); + make_move_iterator(tmp1.begin()), + make_move_iterator(tmp1.end())); } - agent_globals.emplace(agent_executables.first, std::move(tmp0)); + agent_globals.emplace(agent_executables.first, move(tmp0)); } }); @@ -634,81 +425,129 @@ namespace if (it == agent_globals.cend()) return hipErrorNotInitialized; - std::tie(*dptr, *bytes) = read_global_description( + tie(*dptr, *bytes) = read_global_description( it->second.cbegin(), it->second.cend(), name); return dptr ? 
hipSuccess : hipErrorNotFound; } + + hsa_executable_symbol_t find_kernel_by_name( + hsa_executable_t executable, const char* kname) + { + pair r{kname, {}}; + + hsa_executable_iterate_agent_symbols( + executable, + this_agent(), + [](hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* s) { + auto p = + static_cast*>(s); + + if (type(x) != HSA_SYMBOL_KIND_KERNEL) { + return HSA_STATUS_SUCCESS; + } + if (name(x) != p->first) return HSA_STATUS_SUCCESS; + + p->second = x; + + return HSA_STATUS_INFO_BREAK; + }, &r); + + return r.second; + } + + string read_elf_file_as_string(const void* file) + { // Precondition: file points to an ELF image that was BITWISE loaded + // into process accessible memory, and not one loaded by + // the loader. This is because in the latter case + // alignment may differ, which will break the size + // computation. + // the image is Elf64, and matches endianness i.e. it is + // Little Endian. + if (!file) return {}; + + auto h = static_cast(file); + auto s = static_cast(file); + // This assumes the common case of SHT being the last part of the ELF. + auto sz = sizeof(Elf64_Ehdr) + h->e_shoff + h->e_shentsize * h->e_shnum; + + return string{s, s + sz}; + } +} // Anonymous namespace, internal linkage. 
+ +hipError_t ihipModuleGetFunction( + hipFunction_t *func, hipModule_t hmod, const char *name) +{ + HIP_INIT_API(func, hmod, name); + + if (!func || !name) return ihipLogStatus(hipErrorInvalidValue); + + auto ctx = ihipGetTlsDefaultCtx(); + + if (!ctx) return ihipLogStatus(hipErrorInvalidContext); + + hipError_t ret = hipSuccess; + + *func = new ihipModuleSymbol_t; + + if (!*func) return ihipLogStatus(hipErrorInvalidValue); + + auto kernel = find_kernel_by_name(hmod->executable, name); + + if (kernel.handle == 0u) return ihipLogStatus(hipErrorNotFound); + + (*func)->_object = kernel_object(kernel); + (*func)->_groupSegmentSize = group_size(kernel); + (*func)->_privateSegmentSize = private_size(kernel); + (*func)->_name = name; + + return ihipLogStatus(hipSuccess); +} + +hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, + const char *name){ + HIP_INIT_API(hfunc, hmod, name); + return ihipLogStatus(ihipModuleGetFunction(hfunc, hmod, name)); } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char* name) { HIP_INIT_API(dptr, bytes, hmod, name); - hipError_t ret = hipSuccess; - if(dptr == NULL || bytes == NULL){ - return ihipLogStatus(hipErrorInvalidValue); - } - if(name == NULL){ - return ihipLogStatus(hipErrorNotInitialized); - } - else{ - ret = hmod ? - read_agent_global_from_module(dptr, bytes, hmod, name) : - read_agent_global_from_process(dptr, bytes, name); - return ihipLogStatus(ret); - } + if(!dptr || !bytes) return ihipLogStatus(hipErrorInvalidValue); + + if(!name) return ihipLogStatus(hipErrorNotInitialized); + + const auto r = hmod ? 
+ read_agent_global_from_module(dptr, bytes, hmod, name) : + read_agent_global_from_process(dptr, bytes, name); + + return ihipLogStatus(r); } hipError_t hipModuleLoadData(hipModule_t *module, const void *image) { HIP_INIT_API(module, image); - hipError_t ret = hipSuccess; - if(image == NULL || module == NULL){ - return ihipLogStatus(hipErrorNotInitialized); - } else { - auto ctx = ihipGetTlsDefaultCtx(); - *module = new ihipModule_t; - int deviceId = ctx->getDevice()->_deviceId; - ihipDevice_t *currentDevice = ihipGetDevice(deviceId); - void *p; - uint64_t size = ElfSize(image); - hsa_agent_t agent = currentDevice->_hsaAgent; - hsa_region_t sysRegion; - hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion); - status = hsa_memory_allocate(sysRegion, size, (void**)&p); + if (!module) return ihipLogStatus(hipErrorInvalidValue); - if(status != HSA_STATUS_SUCCESS){ - return ihipLogStatus(hipErrorOutOfMemory); - } + *module = new ihipModule_t; - char *ptr = (char*)p; - if(!ptr){ - return ihipLogStatus(hipErrorOutOfMemory); - } - (*module)->ptr = p; - (*module)->size = size; + auto ctx = ihipGetTlsDefaultCtx(); + if (!ctx) return ihipLogStatus(hipErrorInvalidContext); - memcpy(ptr, image, size); + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &(*module)->executable); - status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object); + (*module)->executable = hip_impl::load_executable( + read_elf_file_as_string(image), (*module)->executable, this_agent()); - if(status != HSA_STATUS_SUCCESS){ - return ihipLogStatus(hipErrorSharedObjectInitFailed); - } - - status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable); - CHECKLOG_HSA(status, hipErrorNotInitialized); - - status = hsa_executable_load_code_object((*module)->executable, agent, (*module)->object, NULL); - CHECKLOG_HSA(status, hipErrorNotInitialized); - - status 
= hsa_executable_freeze((*module)->executable, NULL); - CHECKLOG_HSA(status, hipErrorNotInitialized); - } - return ihipLogStatus(ret); + return ihipLogStatus( + (*module)->executable.handle ? hipSuccess : hipErrorUnknown); } hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues) @@ -716,21 +555,20 @@ hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned return hipModuleLoadData(module, image); } -hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name) +hipError_t hipModuleGetTexRef( + textureReference** texRef, hipModule_t hmod, const char* name) { HIP_INIT_API(texRef, hmod, name); + hipError_t ret = hipErrorNotFound; - if(texRef == NULL){ - ret = hipErrorInvalidValue; - } else { - if(name == NULL || hmod == NULL){ - ret = hipErrorNotInitialized; - } else{ - const auto it = hip_impl::globals().find(name); - if (it == hip_impl::globals().end()) return ihipLogStatus(hipErrorInvalidValue); - *texRef = reinterpret_cast(it->second.get()); - ret = hipSuccess; - } - } - return ihipLogStatus(ret); + if(!texRef) return ihipLogStatus(hipErrorInvalidValue); + + if(!hmod || !name) return ihipLogStatus(hipErrorNotInitialized); + + const auto it = globals().find(name); + if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue); + + *texRef = static_cast(it->second.get()); + + return ihipLogStatus(hipSuccess); } diff --git a/src/hsa_helpers.hpp b/src/hsa_helpers.hpp new file mode 100644 index 0000000000..d8e09b7aa9 --- /dev/null +++ b/src/hsa_helpers.hpp @@ -0,0 +1,112 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#pragma once + +#include + +#include +#include + +namespace hip_impl +{ + inline + void* address(hsa_executable_symbol_t x) + { + void* r = nullptr; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r); + + return r; + } + + inline + hsa_agent_t agent(hsa_executable_symbol_t x) + { + hsa_agent_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); + + return r; + } + + inline + std::uint32_t group_size(hsa_executable_symbol_t x) + { + std::uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); + + return r; + } + + inline + std::uint64_t kernel_object(hsa_executable_symbol_t x) + { + std::uint64_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); + + return r; + } + + inline + std::string name(hsa_executable_symbol_t x) + { + std::uint32_t sz = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + std::string r(sz, '\0'); + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; + } + + inline + std::uint32_t private_size(hsa_executable_symbol_t x) + { + std::uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); + + return r; + } + + inline + std::uint32_t size(hsa_executable_symbol_t x) + { + std::uint32_t r = 0; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r); + + return r; + } + + inline + hsa_symbol_kind_t type(hsa_executable_symbol_t x) + { + hsa_symbol_kind_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); + + return r; + } +} \ No newline at end of file diff --git a/src/program_state.cpp b/src/program_state.cpp index 47071d0236..e867887da2 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -3,6 +3,7 @@ #include "../include/hip/hcc_detail/code_object_bundle.hpp" 
#include "hip_hcc_internal.h" +#include "hsa_helpers.hpp" #include "trace_helper.h" #include "elfio/elfio.hpp" @@ -146,13 +147,11 @@ namespace void associate_code_object_symbols_with_host_allocation( const elfio& reader, - const elfio& self_reader, section* code_object_dynsym, - section* process_symtab, hsa_agent_t agent, hsa_executable_t executable) { - if (!code_object_dynsym || !process_symtab) return; + if (!code_object_dynsym) return; const auto undefined_symbols = copy_names_of_undefined_symbols( symbol_section_accessor{reader, code_object_dynsym}); @@ -294,68 +293,6 @@ namespace return r; } - inline - hsa_agent_t agent(hsa_executable_symbol_t x) - { - hsa_agent_t r = {}; - hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); - - return r; - } - - inline - uint32_t group_size(hsa_executable_symbol_t x) - { - uint32_t r = 0u; - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); - - return r; - } - - inline - uint64_t kernel_object(hsa_executable_symbol_t x) - { - uint64_t r = 0u; - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); - - return r; - } - - inline - string name(hsa_executable_symbol_t x) - { - uint32_t sz = 0u; - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); - - string r(sz, '\0'); - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); - - return r; - } - - inline - uint32_t private_size(hsa_executable_symbol_t x) - { - uint32_t r = 0u; - hsa_executable_symbol_get_info( - x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); - - return r; - } - - inline - hsa_symbol_kind_t type(hsa_executable_symbol_t x) - { - hsa_symbol_kind_t r = {}; - hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); - - return r; - } - const unordered_map>& kernels() { static unordered_map> r; @@ -384,42 +321,43 @@ namespace } void load_code_object_and_freeze_executable( - istream& 
file, hsa_agent_t agent, hsa_executable_t executable) + const string& file, hsa_agent_t agent, hsa_executable_t executable) { // TODO: the following sequence is inefficient, should be refactored // into a single load of the file and subsequent ELFIO // processing. static const auto cor_deleter = [](hsa_code_object_reader_t* p) { - hsa_code_object_reader_destroy(*p); + if (p) { + hsa_code_object_reader_destroy(*p); + delete p; + } }; using RAII_code_reader = unique_ptr< hsa_code_object_reader_t, decltype(cor_deleter)>; - file.seekg(0); + if (!file.empty()) { + RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; + hsa_code_object_reader_create_from_memory( + file.data(), file.size(), tmp.get()); - vector blob{ - istreambuf_iterator{file}, istreambuf_iterator{}}; - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - hsa_code_object_reader_create_from_memory( - blob.data(), blob.size(), tmp.get()); + hsa_executable_load_agent_code_object( + executable, agent, *tmp, nullptr, nullptr); - hsa_executable_load_agent_code_object( - executable, agent, *tmp, nullptr, nullptr); + hsa_executable_freeze(executable, nullptr); - hsa_executable_freeze(executable, nullptr); + static vector code_readers; + static mutex mtx; - static vector code_readers; - static mutex mtx; - - lock_guard lck{mtx}; - code_readers.push_back(move(tmp)); + lock_guard lck{mtx}; + code_readers.push_back(move(tmp)); + } } } namespace hip_impl { const unordered_map>& executables() - { + { // TODO: This leaks the hsa_executable_ts, it should use RAII. static unordered_map> r; static once_flag f; @@ -449,8 +387,7 @@ namespace hip_impl // TODO: this is massively inefficient and only // meant for illustration. 
string blob_to_str{blob.cbegin(), blob.cend()}; - stringstream istr{blob_to_str}; - tmp = load_executable(tmp, a, istr); + tmp = load_executable(blob_to_str, tmp, a); if (tmp.handle) r[a].push_back(tmp); } @@ -535,33 +472,23 @@ namespace hip_impl } hsa_executable_t load_executable( - hsa_executable_t executable, hsa_agent_t agent, istream& file) + const string& file, hsa_executable_t executable, hsa_agent_t agent) { elfio reader; - if (!reader.load(file)) { - return hsa_executable_t{}; - } - else { - // TODO: this may benefit from caching as well. - elfio self_reader; - self_reader.load("/proc/self/exe"); + stringstream tmp{file}; - const auto symtab = - find_section_if(self_reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); + if (!reader.load(tmp)) return hsa_executable_t{}; - const auto code_object_dynsym = - find_section_if(reader, [](const ELFIO::section* x) { + const auto code_object_dynsym = + find_section_if(reader, [](const ELFIO::section* x) { return x->get_type() == SHT_DYNSYM; - }); + }); - associate_code_object_symbols_with_host_allocation( - reader, self_reader, code_object_dynsym, symtab, agent, executable); + associate_code_object_symbols_with_host_allocation( + reader, code_object_dynsym, agent, executable); - load_code_object_and_freeze_executable(file, agent, executable); + load_code_object_and_freeze_executable(file, agent, executable); - return executable; - } + return executable; } } // Namespace hip_impl. \ No newline at end of file