From 28f87f7d2e761400e2d3b09a0b2b4f67b1080656 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 1 Nov 2017 15:09:59 +0000 Subject: [PATCH 01/27] This switches HIP from its currently convoluted macro + pfe based dispatch mechanism to a more natural one partially based on the existing module API. The basic idea is that HCC will always correctly emit __global__ functions: as empty-bodied stubs, on host, and as kernels, on device. It then becomes trivial to obtain the mangled name on host, at dispatch, from the function's address, and then to use the mangled name to retrieve the kernel. This should address all problems stemming from serialisation, dubious mismatches due to the manufactured functor, macro-isms et al. It also immediately enables support for generalised globals as a consequence of that being available in the module API. Finally, it will make debugging much easier, since the actual names of the __global__ functions will automatically be used in traces etc. One detail is that due to how dispatch works now (hipLaunchKernel and hipLaunchKernelGGL are themselves variadic function templates which deduce the function type of the callee), in certain cases it may be necessary to insert explicit casts to ensure that the variadic argument list selects a viable overload - this can be observed in some unit tests. Eventually we may be able to remove this limitation, but for now it does not appear terribly onerous. The code is not extremely HIPpie, nor is it fully optimised, but rather is intended as a starting point for the HIP team to make its own. 
--- .../hip/hcc_detail/code_object_bundle.hpp | 134 +++ .../hip/hcc_detail/grid_launch_GGL.hpp | 1059 ++--------------- hipamd/include/hip/hcc_detail/hip_runtime.h | 4 +- hipamd/include/hip/hcc_detail/host_defines.h | 3 +- .../include/hip/hcc_detail/program_state.hpp | 60 + hipamd/src/code_object_bundle.cpp | 39 + hipamd/src/grid_launch.cpp | 142 ++- hipamd/src/hip_hcc_internal.h | 2 +- hipamd/src/hip_memory.cpp | 20 +- hipamd/src/hip_module.cpp | 197 +-- hipamd/src/program_state.cpp | 498 ++++++++ hipamd/tests/src/context/hipMemsetD8.cpp | 3 +- hipamd/tests/src/deviceLib/hipTestDevice.cpp | 141 ++- .../src/deviceLib/hipTestDeviceDouble.cpp | 124 +- hipamd/tests/src/deviceLib/hip_test_ldg.cpp | 45 +- .../src/experimental/xcompile/hipxxKer.cpp | 10 +- .../src/kernel/hipLanguageExtensions.cpp | 4 +- hipamd/tests/src/kernel/hipTestMemKernel.cpp | 45 +- .../src/runtimeApi/event/hipEventRecord.cpp | 11 +- .../src/runtimeApi/event/record_event.cpp | 31 +- .../tests/src/runtimeApi/memory/hipMemcpy.cpp | 77 +- .../runtimeApi/memory/hipMemcpy_simple.cpp | 11 +- .../multiThread/hipMultiThreadStreams1.cpp | 15 +- .../src/runtimeApi/stream/hipNullStream.cpp | 46 +- .../src/runtimeApi/stream/hipStreamSync2.cpp | 39 +- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 46 +- hipamd/tests/src/stress/hipStressKernel.cpp | 3 + 27 files changed, 1457 insertions(+), 1352 deletions(-) create mode 100644 hipamd/include/hip/hcc_detail/code_object_bundle.hpp create mode 100644 hipamd/include/hip/hcc_detail/program_state.hpp create mode 100644 hipamd/src/code_object_bundle.cpp create mode 100644 hipamd/src/program_state.cpp diff --git a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp new file mode 100644 index 0000000000..080132c561 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp @@ -0,0 +1,134 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace 
hip_impl +{ + hsa_isa_t triple_to_hsa_isa(const std::string& triple); + + struct Bundled_code { + union { + struct { + std::uint64_t offset; + std::uint64_t bundle_sz; + std::uint64_t triple_sz; + }; + std::uint8_t cbuf[ + sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; + }; + std::string triple; + std::vector blob; + }; + + class Bundled_code_header { + // DATA - STATICS + static constexpr const char magic_string_[] = + "__CLANG_OFFLOAD_BUNDLE__"; + static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1; + + // DATA + union { + struct { + std::uint8_t bundler_magic_string_[magic_string_sz_]; + std::uint64_t bundle_cnt_; + }; + std::uint8_t cbuf_[ + sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)]; + }; + std::vector bundles_; + + // FRIENDS - MANIPULATORS + template + friend + inline + bool read( + RandomAccessIterator f, + RandomAccessIterator l, + Bundled_code_header& x) + { + std::copy_n(f, sizeof(x.cbuf_), x.cbuf_); + + if (valid(x)) { + x.bundles_.resize(x.bundle_cnt_); + + auto it = f + sizeof(x.cbuf_); + for (auto&& y : x.bundles_) { + std::copy_n(it, sizeof(y.cbuf), y.cbuf); + it += sizeof(y.cbuf); + + y.triple.insert(y.triple.cend(), it, it + y.triple_sz); + + std::copy_n( + f + y.offset, y.bundle_sz, std::back_inserter(y.blob)); + + it += y.triple_sz; + } + + return true; + } + + return false; + } + friend + inline + bool read(const std::vector& blob, Bundled_code_header& x) + { + return read(blob.cbegin(), blob.cend(), x); + } + friend + inline + bool read(std::istream& is, Bundled_code_header& x) + { + return read(std::vector{ + std::istreambuf_iterator{is}, + std::istreambuf_iterator{}}, + x); + } + + // FRIENDS - ACCESSORS + friend + inline + bool valid(const Bundled_code_header& x) + { + return std::equal( + x.bundler_magic_string_, + x.bundler_magic_string_ + magic_string_sz_, + x.magic_string_); + } + friend + inline + const std::vector& bundles(const Bundled_code_header& x) + { + return x.bundles_; + } + public: + // 
CREATORS + Bundled_code_header() = default; + template + Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l); + explicit + Bundled_code_header(const std::vector& blob); + Bundled_code_header(const Bundled_code_header&) = default; + Bundled_code_header(Bundled_code_header&&) = default; + ~Bundled_code_header() = default; + + // MANIPULATORS + Bundled_code_header& operator=(const Bundled_code_header&) = default; + Bundled_code_header& operator=(Bundled_code_header&&) = default; + }; + + // CREATORS + template + Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{} + { + read(f, l, *this); + } +} // Namespace hip_impl. \ No newline at end of file diff --git a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 4c632f9d68..e3fa3331ac 100644 --- a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -24,984 +24,139 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 1 +#include "code_object_bundle.hpp" #include "concepts.hpp" #include "helpers.hpp" +#include "program_state.hpp" #include "hc.hpp" #include "hip/hip_hcc.h" #include "hip_runtime.h" +#include +#include #include #include +#include #include +#include +#include #include +#include #include +#include namespace hip_impl { - namespace + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + T round_up_to_next_multiple_nonnegative(T x, T y) { - struct New_grid_launch_tag {}; - struct Old_grid_launch_tag {}; - - template - class RAII_guard { - D dtor_; - public: - RAII_guard() = default; - - RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} - { - ctor(); - } - - RAII_guard(const RAII_guard&) = default; - RAII_guard(RAII_guard&&) = default; - - RAII_guard& operator=(const RAII_guard&) = default; - RAII_guard& operator=(RAII_guard&&) = default; - - ~RAII_guard() { dtor_(); } - }; - - template - RAII_guard make_RAII_guard(const C& ctor, D dtor) - 
{ - return RAII_guard{ctor, std::move(dtor)}; - } - - template - using is_new_grid_launch_t = typename std::conditional< - is_callable{}, - New_grid_launch_tag, - Old_grid_launch_tag>::type; + T tmp = x + y - 1; + return tmp - tmp % y; } - // TODO: - dispatch rank should be derived from the domain dimensions passed - // in, and not always assumed to be 3; - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - const hc::accelerator_view& acc_v, - K k) + std::vector make_kernarg() { - const auto d = hc::extent<3>{ - num_blocks.z * dim_blocks.z, - num_blocks.y * dim_blocks.y, - num_blocks.x * dim_blocks.x}.tile_with_dynamic( - dim_blocks.z, - dim_blocks.y, - dim_blocks.x, - group_mem_bytes); - - try { - hc::parallel_for_each(acc_v, d, k); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } + return {}; } - // TODO: these are workarounds, they should be removed. - - hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); - void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); - void unlock_stream_hip_( - hipStream_t, void*, const char*, hc::accelerator_view*); - - template - requires(Domain == {Ts...}) inline - void grid_launch_hip_impl_( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, + std::vector make_kernarg(std::vector kernarg) + { + return kernarg; + } + + template + inline + std::vector make_kernarg(std::vector kernarg, T x) + { + kernarg.resize( + round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + + sizeof(T)); + + new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; + + return kernarg; + } + + template + inline + std::vector make_kernarg( + std::vector kernarg, T x, Ts... 
xs) + { + return make_kernarg( + make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); + } + + template + inline + std::vector make_kernarg(Ts... xs) + { + std::vector kernarg; + kernarg.reserve(sizeof(std::tuple)); + + return make_kernarg(std::move(kernarg), std::move(xs)...); + } + + void hipLaunchKernelGGLImpl( + std::uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, hipStream_t stream, - const char* kernel_name, - K k) - { - void* lck_stream = nullptr; - auto acc_v = lock_stream_hip_(stream, lck_stream); - auto stream_guard = make_RAII_guard( - std::bind( - print_prelaunch_trace_, - kernel_name, - num_blocks, - dim_blocks, - group_mem_bytes, - stream), - std::bind( - unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); + void** kernarg); +} // Namespace hip_impl. - try { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - acc_v, - std::move(k)); - } - catch (std::exception& ex) { - std::cerr << "Failed in " << __func__ << ", with exception: " - << ex.what() << std::endl; - throw; - } - } +template +inline +void hipLaunchKernelGGL( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + Args... 
args) +{ + auto kernarg = hip_impl::make_kernarg(std::move(args)...); + std::size_t kernarg_size = kernarg.size(); - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } + void* config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),//&kernarg, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, + HIP_LAUNCH_PARAM_END + }; - template - requires(Domain == {hipLaunchParm, Ts...}) - inline - void grid_launch_hip_impl_( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - const char* kernel_name, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - kernel_name, - std::move(k)); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - K k) - { - grid_launch_hip_impl_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::move(k)); - } - - // TODO: these are temporary and purposefully noisy and disruptive. 
- #define make_kernel_name_hip(k, n)\ - HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ - HIP_kernel_functor_name_end ## _ ## n - - #define make_kernel_functor_hip_30(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26, p27)\ - struct make_kernel_name_hip(function_name, 28) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - std::decay_t _p27_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_, _p27_);\ - }\ - } - #define make_kernel_functor_hip_29(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25, p26)\ - struct make_kernel_name_hip(function_name, 27) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - 
std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - std::decay_t _p26_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ - _p26_);\ - }\ - } - #define make_kernel_functor_hip_28(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24, p25)\ - struct make_kernel_name_hip(function_name, 26) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - std::decay_t _p25_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ - }\ - } - #define make_kernel_functor_hip_27(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ - p24)\ - struct make_kernel_name_hip(function_name, 25) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - 
std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - std::decay_t _p24_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - }\ - } - #define make_kernel_functor_hip_26(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ - struct make_kernel_name_hip(function_name, 24) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - std::decay_t _p23_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - }\ - } - #define make_kernel_functor_hip_25(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ - struct make_kernel_name_hip(function_name, 23) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t 
_p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - std::decay_t _p22_;\ - __attribute__((used, flatten))\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - }\ - } - #define make_kernel_functor_hip_24(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ - struct make_kernel_name_hip(function_name, 22) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - std::decay_t _p21_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - }\ - } - #define make_kernel_functor_hip_23(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ - struct make_kernel_name_hip(function_name, 21) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - 
std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - std::decay_t _p20_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - }\ - } - #define make_kernel_functor_hip_22(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ - struct make_kernel_name_hip(function_name, 20) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - std::decay_t _p19_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - }\ - } - #define make_kernel_functor_hip_21(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17, p18)\ - struct make_kernel_name_hip(function_name, 19) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - 
std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - std::decay_t _p18_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - }\ - } - #define make_kernel_functor_hip_20(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16, p17)\ - struct make_kernel_name_hip(function_name, 18) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - std::decay_t _p17_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - }\ - } - #define make_kernel_functor_hip_19(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15, p16)\ - struct make_kernel_name_hip(function_name, 17) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - std::decay_t _p16_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - }\ - } - #define 
make_kernel_functor_hip_18(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14, p15)\ - struct make_kernel_name_hip(function_name, 16) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - std::decay_t _p15_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - }\ - } - #define make_kernel_functor_hip_17(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13, p14)\ - struct make_kernel_name_hip(function_name, 15) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - std::decay_t _p14_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_, _p14_);\ - }\ - } - #define make_kernel_functor_hip_16(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12, p13)\ - struct make_kernel_name_hip(function_name, 14) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - std::decay_t _p13_;\ - void operator()(const hc::tiled_index<3>&) 
const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_, _p13_);\ - }\ - } - #define make_kernel_functor_hip_15(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11, p12)\ - struct make_kernel_name_hip(function_name, 13) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - std::decay_t _p12_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_, _p12_);\ - }\ - } - #define make_kernel_functor_hip_14(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ - p10, p11)\ - struct make_kernel_name_hip(function_name, 12) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - std::decay_t _p11_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_, _p11_);\ - }\ - } - #define make_kernel_functor_hip_13(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - struct make_kernel_name_hip(function_name, 11) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - std::decay_t _p10_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - }\ - } - #define 
make_kernel_functor_hip_12(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - struct make_kernel_name_hip(function_name, 10) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - std::decay_t _p9_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_);\ - }\ - } - #define make_kernel_functor_hip_11(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - struct make_kernel_name_hip(function_name, 9) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - std::decay_t _p8_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ - }\ - } - #define make_kernel_functor_hip_10(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - struct make_kernel_name_hip(function_name, 8) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - std::decay_t _p7_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - }\ - } - #define make_kernel_functor_hip_9(\ - function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - struct make_kernel_name_hip(function_name, 7) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - std::decay_t _p6_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - }\ - } - #define make_kernel_functor_hip_8(\ - function_name, kernel_name, 
p0, p1, p2, p3, p4, p5)\ - struct make_kernel_name_hip(function_name, 6) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - std::decay_t _p5_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - }\ - } - #define make_kernel_functor_hip_7(\ - function_name, kernel_name, p0, p1, p2, p3, p4)\ - struct make_kernel_name_hip(function_name, 5) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - std::decay_t _p4_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - }\ - } - #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ - struct make_kernel_name_hip(function_name, 4) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - std::decay_t _p3_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - }\ - } - #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ - struct make_kernel_name_hip(function_name, 3) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - std::decay_t _p2_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_, _p2_);\ - }\ - } - #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ - struct make_kernel_name_hip(function_name, 2) {\ - std::decay_t _p0_;\ - std::decay_t _p1_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_, _p1_);\ - }\ - } - #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n - #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ - struct make_kernel_name_hip(function_name, 1) {\ - std::decay_t _p0_;\ - void operator()(const hc::tiled_index<3>&) const [[hc]]\ - {\ - kernel_name(_p0_);\ - }\ - } - #define make_kernel_functor_hip_2(function_name, kernel_name)\ - struct 
make_kernel_name_hip(function_name, 0) {\ - void operator()(const hc::tiled_index<3>&) [[hc]]\ - {\ - return kernel_name(hipLaunchParm{});\ - }\ - } - #define make_kernel_functor_hip_1(...) - #define make_kernel_functor_hip_0(...) - #define make_kernel_functor_hip_(...)\ - overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - - - #define hipLaunchNamedKernelGGL(\ - function_name,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ - hip_kernel_functor_impl_{__VA_ARGS__};\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - hip_kernel_functor_impl_);\ - } while(0) - - #define hipLaunchKernelGGL(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchNamedKernelGGL(\ - unnamed,\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ##__VA_ARGS__);\ - } while (0) - - #define hipLaunchKernel(\ - kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) + hip_impl::hipLaunchKernelGGLImpl( + reinterpret_cast(kernel), + numBlocks, + dimBlocks, + sharedMemBytes, + stream, + &config[0]); } + +template +inline +void hipLaunchKernel( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t groupMemBytes, + hipStream_t stream, + Args... 
args) +{ + hipLaunchKernelGGL( + kernel, + numBlocks, + dimBlocks, + groupMemBytes, + stream, + hipLaunchParm{}, + std::move(args)...); +} + #endif //GENERIC_GRID_LAUNCH diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index 370ac2abbb..d3211ed3f5 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -53,7 +53,7 @@ THE SOFTWARE. // define HIP_ENABLE_PRINTF to enable printf #ifdef HIP_ENABLE_PRINTF #define HCC_ENABLE_ACCELERATOR_PRINTF 1 -#endif +#endif //--- // Remainder of this file only compiles with HCC @@ -481,7 +481,7 @@ do {\ type* var = \ (type*)__get_dynamicgroupbaseptr(); \ -#define HIP_DYNAMIC_SHARED_ATTRIBUTE +#define HIP_DYNAMIC_SHARED_ATTRIBUTE diff --git a/hipamd/include/hip/hcc_detail/host_defines.h b/hipamd/include/hip/hcc_detail/host_defines.h index b2e7ac2617..56cfa0cc0f 100644 --- a/hipamd/include/hip/hcc_detail/host_defines.h +++ b/hipamd/include/hip/hcc_detail/host_defines.h @@ -44,7 +44,8 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak)) +#define __global__ \ + __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) diff --git a/hipamd/include/hip/hcc_detail/program_state.hpp b/hipamd/include/hip/hcc_detail/program_state.hpp new file mode 100644 index 0000000000..03701725eb --- /dev/null +++ b/hipamd/include/hip/hcc_detail/program_state.hpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +struct ihipModuleSymbol_t; +using hipFunction_t = ihipModuleSymbol_t*; + +namespace hip_impl +{ + struct Kernel_descriptor { + std::uint64_t kernel_object_; + std::uint32_t group_size_; + std::uint32_t private_size_; + std::string name_; + + operator hipFunction_t() const + { // TODO: this is awful and only meant for illustration. + return reinterpret_cast( + const_cast(this)); + } + }; + + const std::unordered_map< + std::uintptr_t, + std::vector>>& functions(); + const std::unordered_map& function_names(); + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, std::istream& file); +} // Namespace hip_impl. 
\ No newline at end of file diff --git a/hipamd/src/code_object_bundle.cpp b/hipamd/src/code_object_bundle.cpp new file mode 100644 index 0000000000..d7d2cd1e10 --- /dev/null +++ b/hipamd/src/code_object_bundle.cpp @@ -0,0 +1,39 @@ +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include + +#include +#include +#include + +hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple) +{ + static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx"; + static constexpr std::size_t prefix_sz = sizeof(prefix) - 1; + + hsa_isa_t r = {}; + + auto idx = triple.find(prefix); + + if (idx != std::string::npos) { + idx += prefix_sz; + std::string tmp = "AMD:AMDGPU"; + while (idx != triple.size()) { + tmp.push_back(':'); + tmp.push_back(triple[idx++]); + } + + hsa_isa_from_name(tmp.c_str(), &r); + } + + return r; +} + +// DATA - STATICS +constexpr const char hip_impl::Bundled_code_header::magic_string_[]; + +// CREATORS +hip_impl::Bundled_code_header::Bundled_code_header( + const std::vector& x) + : Bundled_code_header{x.cbegin(), x.cend()} +{} \ No newline at end of file diff --git a/hipamd/src/grid_launch.cpp b/hipamd/src/grid_launch.cpp index fd5c2a1573..4a26f66c8c 100644 --- a/hipamd/src/grid_launch.cpp +++ b/hipamd/src/grid_launch.cpp @@ -21,76 +21,118 @@ THE SOFTWARE. */ #include "hip/hcc_detail/grid_launch_GGL.hpp" +#include "hip/hcc_detail/program_state.hpp" + +#include "hip/hip_runtime_api.h" // Internal header, do not percolate upwards. #include "hip_hcc_internal.h" #include "hc.hpp" #include "trace_helper.h" +#include +#include +#include +#include + #include -#include + +using namespace hc; +using namespace std; namespace hip_impl { - hc::accelerator_view lock_stream_hip_( - hipStream_t& stream, void*& locked_stream) - { // This allocated but does not take ownership of locked_stream. If it is - // not deleted elsewhere it will leak. 
- using L = decltype(stream->lockopen_preKernelCommand()); - - HIP_INIT(); - - stream = ihipSyncAndResolveStream(stream); - locked_stream = new L{stream->lockopen_preKernelCommand()}; - return (*static_cast(locked_stream))->_av; - } - - void print_prelaunch_trace_( - const char* kernel_name, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream) + namespace { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || - HIP_PROFILE_API || - (COMPILE_HIP_DB && (HIP_TRACE_API & (1<second; + } + + inline + string name(hsa_agent_t agent) + { + char n[64] = {}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return string{n}; + } + + inline + hsa_agent_t target_agent(hipStream_t stream) + { + if (stream) { + return *static_cast( + stream->locked_getAv()->get_hsa_agent()); + } + else if ( + ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + return ihipGetDevice( + ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; + } + else { + return *static_cast( + accelerator{}.get_default_view().get_hsa_agent()); } } } - void unlock_stream_hip_( + void hipLaunchKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, hipStream_t stream, - void* locked_stream, - const char* kernel_name, - hc::accelerator_view* acc_v) - { // Precondition: acc_v is the accelerator_view associated with stream - // which is guarded by locked_stream; - // locked_stream is deletable. 
- using L = decltype(stream->lockopen_preKernelCommand()); + void** kernarg) + { + const auto it0 = functions().find(function_address); - stream->lockclose_postKernelCommand(kernel_name, acc_v); + if (it0 == functions().cend()) { + throw runtime_error{ + "No device code available for function: " + + name(function_address) + }; + } - delete static_cast(locked_stream); - locked_stream = nullptr; + auto agent = target_agent(stream); + + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first.handle == agent.handle; + }); + + if (it1 == it0->second.cend()) { + throw runtime_error{ + "No code available for function: " + name(function_address) + + ", for agent: " + name(agent) + }; + } + + for (auto&& agent_kernel : it0->second) { + if (agent.handle == agent_kernel.first.handle) { + hipModuleLaunchKernel( + agent_kernel.second, + numBlocks.x, + numBlocks.y, + numBlocks.z, + dimBlocks.x, + dimBlocks.y, + dimBlocks.z, + sharedMemBytes, + stream, + nullptr, + kernarg); + } + } } } diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h index 197cd35bfa..503bebcd6a 100644 --- a/hipamd/src/hip_hcc_internal.h +++ b/hipamd/src/hip_hcc_internal.h @@ -233,7 +233,7 @@ static const DbName dbName [] = #if COMPILE_HIP_DB #define tprintf(trace_level, ...) 
{\ if (HIP_DB & (1<<(trace_level))) {\ - char msgStr[1000];\ + char msgStr[2000];\ snprintf(msgStr, 2000, __VA_ARGS__);\ fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \ }\ diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index a8324c5729..96fc25c27d 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -65,7 +65,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) if (shareWithAll) { hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); - tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); if (s != HSA_STATUS_SUCCESS) { ret = -1; } @@ -122,7 +122,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool if (HIP_INIT_ALLOC != -1) { // TODO , dont' call HIP API directly here: hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes); - } + } if (ptr != nullptr) { int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); @@ -251,7 +251,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hip_status = hipErrorMemoryAllocation; } - } + } return ihipLogStatus(hip_status); @@ -284,10 +284,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } - const unsigned supportedFlags = hipHostMallocPortable - | hipHostMallocMapped - | hipHostMallocWriteCombined - | hipHostMallocCoherent + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent | hipHostMallocNonCoherent; @@ -300,7 +300,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) hip_status = hipErrorInvalidValue; } else { auto device = ctx->getWriteableDevice(); - + unsigned amFlags = 0; if (flags & hipHostMallocCoherent) { amFlags = amHostCoherent; @@ -581,7 +581,7 @@ 
hipError_t hipMalloc3DArray(hipArray_t *array, hsa_ext_image_data_info_t imageInfo; hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo); size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment; - + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment); if (size && (*ptr == NULL)) { @@ -1585,7 +1585,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ HIP_INIT_API ( handle, devPtr); hipError_t hipStatus = hipSuccess; // Get the size of allocated pointer - size_t psize; + size_t psize = 0u; hc::accelerator acc; if((handle == NULL) || (devPtr == NULL)) { hipStatus = hipErrorInvalidResourceHandle; diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index e9e572af9b..a77ee48a53 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -119,15 +119,18 @@ namespace hipdrv { uint64_t PrintSymbolSizes(const void *emi, const char *name){ using namespace ELFIO; - const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi; + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; if(NULL == ehdr || EV_CURRENT != ehdr->e_version){} - const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff); + const ELFIO::Elf64_Shdr * shdr = + (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff); for(uint16_t i=0;ie_shnum;++i){ if(shdr[i].sh_type == SHT_SYMTAB){ - const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset); + const ELFIO::Elf64_Sym *syms = + (const ELFIO::Elf64_Sym*)((char*)emi + shdr[i].sh_offset); assert(syms); uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize; - const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); + const char* strtab = + (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); assert(strtab); for(uint64_t i=0;ie_shoff); + const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi; + const ELFIO::Elf64_Shdr *shdr 
= (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff); uint64_t max_offset = ehdr->e_shoff; uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; @@ -164,156 +167,8 @@ uint64_t ElfSize(const void *emi){ return total_size; } -namespace -{ - template - inline - ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) - { - using namespace std; - - const auto it = find_if( - reader.sections.begin(), reader.sections.end(), move(p)); - - return it != reader.sections.end() ? *it : nullptr; - } - - inline - std::vector copy_names_of_undefined_symbols( - const ELFIO::symbol_section_accessor& section) - { - using namespace ELFIO; - using namespace std; - - vector r; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (sect_idx == SHN_UNDEF && !name.empty()) { - r.push_back(std::move(name)); - } - } - - return r; - } - - inline - std::pair find_symbol_address( - const ELFIO::symbol_section_accessor& section, - const std::string& symbol_name) - { - using namespace ELFIO; - using namespace std; - - static constexpr pair r{0, 0}; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. 
- string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (name == symbol_name) return make_pair(value, size); - } - - return r; - } - - inline - void associate_code_object_symbols_with_host_allocation( - const ELFIO::elfio& reader, - const ELFIO::elfio& self_reader, - ELFIO::section* code_object_dynsym, - ELFIO::section* process_symtab, - hsa_agent_t agent, - hsa_executable_t executable) - { - using namespace ELFIO; - using namespace std; - - if (!code_object_dynsym || !process_symtab) return; - - const auto undefined_symbols = copy_names_of_undefined_symbols( - symbol_section_accessor{reader, code_object_dynsym}); - - for (auto&& x : undefined_symbols) { - const auto tmp = find_symbol_address( - symbol_section_accessor{self_reader, process_symtab}, x); - - assert(tmp.first); - - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - - static vector< - unique_ptr> globals; - static mutex mtx; - - lock_guard lck{mtx}; - globals.emplace_back(p, hsa_amd_memory_unlock); - } - } - - inline - void load_code_object_and_freeze_executable( - const char* file, hsa_agent_t agent, hsa_executable_t executable) - { // TODO: the following sequence is inefficient, should be refactored - // into a single load of the file and subsequent ELFIO - // processing. 
- using namespace std; - - static const auto cor_deleter = [](hsa_code_object_reader_t* p) { - hsa_code_object_reader_destroy(*p); - }; - - using RAII_code_reader = unique_ptr< - hsa_code_object_reader_t, decltype(cor_deleter)>; - - unique_ptr cobj{fopen(file, "r"), fclose}; - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get()); - - hsa_executable_load_agent_code_object( - executable, agent, *tmp, nullptr, nullptr); - - hsa_executable_freeze(executable, nullptr); - - static vector code_readers; - static mutex mtx; - - lock_guard lck{mtx}; - code_readers.push_back(move(tmp)); - } -} - hipError_t hipModuleLoad(hipModule_t *module, const char *fname) { - using namespace ELFIO; - HIP_INIT_API(module, fname); hipError_t ret = hipSuccess; *module = new ihipModule_t; @@ -336,36 +191,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname) nullptr, &(*module)->executable); - elfio reader; - if (!reader.load(fname)) { + std::ifstream file{fname}; + + if (!file.is_open()) { return ihipLogStatus(hipErrorFileNotFound); } - else { - // TODO: this may benefit from caching as well. - elfio self_reader; - self_reader.load("/proc/self/exe"); - - const auto symtab = - find_section_if(self_reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - const auto code_object_dynsym = - find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_DYNSYM; - }); - - associate_code_object_symbols_with_host_allocation( - reader, - self_reader, - code_object_dynsym, - symtab, - currentDevice->_hsaAgent, - (*module)->executable); - - load_code_object_and_freeze_executable( - fname, currentDevice->_hsaAgent, (*module)->executable); - } + (*module)->executable = hip_impl::load_executable( + (*module)->executable, currentDevice->_hsaAgent, file); + ret = (*module)->executable.handle ? 
hipSuccess : hipErrorUnknown; } return ihipLogStatus(ret); diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp new file mode 100644 index 0000000000..be871a6e84 --- /dev/null +++ b/hipamd/src/program_state.cpp @@ -0,0 +1,498 @@ +#include "../include/hip/hcc_detail/program_state.hpp" + +#include "../include/hip/hcc_detail/code_object_bundle.hpp" + +#include "hip_hcc_internal.h" +#include "trace_helper.h" + +#include "elfio/elfio.hpp" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ELFIO; +using namespace hip_impl; +using namespace std; + +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; + + template<> + struct hash { + size_t operator()(hsa_isa_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + +inline +constexpr +bool operator==(hsa_isa_t x, hsa_isa_t y) +{ + return x.handle == y.handle; +} + +namespace +{ + vector copy_names_of_undefined_symbols( + const symbol_section_accessor& section) + { + vector r; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (sect_idx == SHN_UNDEF && !name.empty()) { + r.push_back(std::move(name)); + } + } + + return r; + } + + pair find_symbol_address( + const symbol_section_accessor& section, + const string& symbol_name) + { + static constexpr pair r{0, 0}; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. 
+ string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + section.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (name == symbol_name) return make_pair(value, size); + } + + return r; + } + + void associate_code_object_symbols_with_host_allocation( + const elfio& reader, + const elfio& self_reader, + section* code_object_dynsym, + section* process_symtab, + hsa_agent_t agent, + hsa_executable_t executable) + { + if (!code_object_dynsym || !process_symtab) return; + + const auto undefined_symbols = copy_names_of_undefined_symbols( + symbol_section_accessor{reader, code_object_dynsym}); + + for (auto&& x : undefined_symbols) { + const auto tmp = find_symbol_address( + symbol_section_accessor{self_reader, process_symtab}, x); + + assert(tmp.first); + + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); + + hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + + static vector< + unique_ptr> globals; + static mutex mtx; + + lock_guard lck{mtx}; + globals.emplace_back(p, hsa_amd_memory_unlock); + } + } + + template + inline + section* find_section_if(elfio& reader, P p) + { + const auto it = find_if( + reader.sections.begin(), reader.sections.end(), std::move(p)); + + return it != reader.sections.end() ? 
*it : nullptr; + } + + vector code_object_blob_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + static constexpr const char kernel_section[] = ".kernel"; + + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{"Failed to load ELF file for current process."}; + } + + auto kernels = find_section_if(reader, [](const section* x) { + return x->get_name() == kernel_section; + }); + + vector r; + if (kernels) { + r.insert( + r.end(), + kernels->get_data(), + kernels->get_data() + kernels->get_size()); + } + + return r; + } + + const unordered_map>>& code_object_blobs() + { + static unordered_map>> r; + static once_flag f; + + call_once(f, []() { + static vector> blobs{ + code_object_blob_for_process()}; + + dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) { + elfio tmp; + if (tmp.load(i->dlpi_name)) { + const auto it = find_section_if(tmp, [](const section* x) { + return x->get_name() == ".kernel"; + }); + + if (it) blobs.emplace_back( + it->get_data(), it->get_data() + it->get_size()); + } + return 0; + }, nullptr); + + for (auto&& blob : blobs) { + Bundled_code_header tmp{blob}; + if (valid(tmp)) { + for (auto&& bundle : bundles(tmp)) { + r[triple_to_hsa_isa(bundle.triple)] + .push_back(bundle.blob); + } + } + } + }); + + return r; + } + + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively 
inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + cout << r.size() << endl; + return r; + } + + inline + hsa_agent_t agent(hsa_executable_symbol_t x) + { + hsa_agent_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); + + return r; + } + + inline + uint32_t group_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); + + return r; + } + + inline + uint64_t kernel_object(hsa_executable_symbol_t x) + { + uint64_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); + + return r; + } + + inline + string name(hsa_executable_symbol_t x) + { + uint32_t sz = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + string r(sz, '\0'); + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; + } + + inline + uint32_t private_size(hsa_executable_symbol_t x) + { + uint32_t r = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); + + return r; + } + + inline + hsa_symbol_kind_t type(hsa_executable_symbol_t x) + { + hsa_symbol_kind_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); + + return r; + } + + const unordered_map>& kernels() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto copy_kernels = []( + hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) { + if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s); + + return HSA_STATUS_SUCCESS; + }; + + for (auto&& agent_executables : executables()) { + for (auto&& executable : agent_executables.second) { + 
hsa_executable_iterate_agent_symbols( + executable, + agent_executables.first, + copy_kernels, + nullptr); + } + } + }); + + return r; + } + + void load_code_object_and_freeze_executable( + istream& file, hsa_agent_t agent, hsa_executable_t executable) + { // TODO: the following sequence is inefficient, should be refactored + // into a single load of the file and subsequent ELFIO + // processing. + static const auto cor_deleter = [](hsa_code_object_reader_t* p) { + hsa_code_object_reader_destroy(*p); + }; + + using RAII_code_reader = unique_ptr< + hsa_code_object_reader_t, decltype(cor_deleter)>; + + file.seekg(0); + + vector blob{ + istreambuf_iterator{file}, istreambuf_iterator{}}; + RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; + hsa_code_object_reader_create_from_memory( + blob.data(), blob.size(), tmp.get()); + + hsa_executable_load_agent_code_object( + executable, agent, *tmp, nullptr, nullptr); + + hsa_executable_freeze(executable, nullptr); + + static vector code_readers; + static mutex mtx; + + lock_guard lck{mtx}; + code_readers.push_back(move(tmp)); + } +} + +namespace hip_impl +{ + const unordered_map& function_names() + { + static constexpr const char self[] = "/proc/self/exe"; + + static unordered_map r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + symbol_section_accessor symbols{reader, symtab}; + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. 
+ string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + symbols.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { + r.emplace(value, name); + } + } + }); + + return r; + } + + const unordered_map< + uintptr_t, vector>>& functions() + { + static unordered_map< + uintptr_t, vector>> r; + static once_flag f; + + call_once(f, []() { + for (auto&& function : function_names()) { + const auto it = kernels().find(function.second); + + if (it != kernels().cend()) { + for (auto&& kernel_symbol : it->second) { + r[function.first].emplace_back( + agent(kernel_symbol), + Kernel_descriptor{ + kernel_object(kernel_symbol), + group_size(kernel_symbol), + private_size(kernel_symbol), + it->first}); + } + } + } + }); + + return r; + } + + hsa_executable_t load_executable( + hsa_executable_t executable, hsa_agent_t agent, istream& file) + { + elfio reader; + if (!reader.load(file)) { + return hsa_executable_t{}; + } + else { + // TODO: this may benefit from caching as well. + elfio self_reader; + self_reader.load("/proc/self/exe"); + + const auto symtab = + find_section_if(self_reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + const auto code_object_dynsym = + find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_DYNSYM; + }); + + associate_code_object_symbols_with_host_allocation( + reader, self_reader, code_object_dynsym, symtab, agent, executable); + + load_code_object_and_freeze_executable(file, agent, executable); + + return executable; + } + } +} // Namespace hip_impl. 
\ No newline at end of file diff --git a/hipamd/tests/src/context/hipMemsetD8.cpp b/hipamd/tests/src/context/hipMemsetD8.cpp index 3730fcb70b..a356d05b76 100644 --- a/hipamd/tests/src/context/hipMemsetD8.cpp +++ b/hipamd/tests/src/context/hipMemsetD8.cpp @@ -46,7 +46,6 @@ int main(int argc, char *argv[]) A_h = new char[Nbytes]; HIPCHECK ( hipMalloc((void **) &A_d, Nbytes) ); - A_h = (char*)malloc(Nbytes); printf ("Size=%zu memsetval=%2x \n", Nbytes, memsetval); HIPCHECK ( hipMemsetD8(A_d, memsetval, Nbytes) ); @@ -61,7 +60,7 @@ int main(int argc, char *argv[]) } hipFree((void *) A_d); - free(A_h); + delete [] A_h; passed(); } diff --git a/hipamd/tests/src/deviceLib/hipTestDevice.cpp b/hipamd/tests/src/deviceLib/hipTestDevice.cpp index 570f3baaf0..fa85940839 100644 --- a/hipamd/tests/src/deviceLib/hipTestDevice.cpp +++ b/hipamd/tests/src/deviceLib/hipTestDevice.cpp @@ -139,7 +139,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -174,7 +181,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -205,7 +219,13 @@ for(int i=0;i<512;i++){ } } -free(A); +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -234,7 +254,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -263,7 +288,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -291,7 +321,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -321,7 +356,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + 
+delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -350,7 +390,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -387,7 +432,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -427,7 +481,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -457,7 +522,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -489,7 +559,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -525,7 +602,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -565,7 +651,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -595,7 +692,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -622,7 +724,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -631,7 +738,7 @@ return false; } int main(){ -if(run_sincosf() && run_sincospif() && run_fdividef() && +if(run_sincosf() && 
run_sincospif() && run_fdividef() && run_llrintf() && run_norm3df() && run_norm4df() && run_normf() && run_rnorm3df() && run_rnorm4df() && run_rnormf() && run_lroundf() && run_llroundf() && diff --git a/hipamd/tests/src/deviceLib/hipTestDeviceDouble.cpp b/hipamd/tests/src/deviceLib/hipTestDeviceDouble.cpp index 5bdbbf1b8f..3b919d0cab 100644 --- a/hipamd/tests/src/deviceLib/hipTestDeviceDouble.cpp +++ b/hipamd/tests/src/deviceLib/hipTestDeviceDouble.cpp @@ -128,7 +128,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -163,7 +170,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -193,7 +207,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -221,7 +240,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -249,7 +273,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -278,7 +307,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -306,7 +340,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -343,7 +382,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -383,7 +431,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] 
D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -416,7 +475,14 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); + if(passed == 1){ return true; } @@ -452,7 +518,16 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); + if(passed == 1){ return true; } @@ -492,7 +567,18 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +delete [] C; +delete [] D; +delete [] E; +hipFree(Ad); +hipFree(Bd); +hipFree(Cd); +hipFree(Dd); +hipFree(Ed); + if(passed == 1){ return true; } @@ -522,7 +608,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } @@ -549,7 +640,12 @@ for(int i=0;i<512;i++){ passed = 1; } } -free(A); + +delete [] A; +delete [] B; +hipFree(Ad); +hipFree(Bd); + if(passed == 1){ return true; } diff --git a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp index 171ff1afd0..5540c4917d 100644 --- a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp @@ -159,11 +159,16 @@ bool dataTypesRun(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -221,11 +226,16 @@ bool dataTypesRun2(){ 
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); @@ -281,11 +291,16 @@ bool dataTypesRun4(){ HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(vectoradd_float, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - deviceA ,deviceB ,WIDTH ,HEIGHT); + hipLaunchKernel( + vectoradd_float, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, + 0, + deviceA, + static_cast(deviceB), + WIDTH, + HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); diff --git a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp index 79a272aaf2..d1bbed63cd 100644 --- a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp @@ -36,17 +36,23 @@ __global__ void Kern(hipLaunchParm lp, float *A) int main() { - float *A, *Ad; + float A[len]; + float *Ad; + for(int i=0;i(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK (hipEventRecord(stop, NULL)); diff --git a/hipamd/tests/src/runtimeApi/event/record_event.cpp b/hipamd/tests/src/runtimeApi/event/record_event.cpp index bd8a3ada8e..a7b99749cb 100644 --- a/hipamd/tests/src/runtimeApi/event/record_event.cpp +++ b/hipamd/tests/src/runtimeApi/event/record_event.cpp @@ -52,7 +52,7 
@@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", + printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", testMask, stream, waitStart, syncModeString(syncMode)); size_t sizeBytes = numElements * sizeof(int); @@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + stream, + static_cast(C_d), + C_h, + numElements, + count); HIPCHECK(hipEventRecord(stop, stream)); @@ -85,8 +94,8 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipEventSynchronize(start)); } - - hipError_t expectedStopError = hipSuccess; + + hipError_t expectedStopError = hipSuccess; // How to wait for the events to finish: switch (syncMode) { @@ -97,12 +106,12 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... 
break; case syncStopEvent: - HIPCHECK(hipEventSynchronize(stop)); + HIPCHECK(hipEventSynchronize(stop)); break; default: assert(0); }; - + float t; @@ -111,25 +120,25 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_ failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e)); } - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); - + // stop usually ready unless we skipped the synchronization (syncNone) HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError); - if (e == hipSuccess) + if (e == hipSuccess) assert (t==0.0f); e = hipEventElapsedTime(&t, start, stop); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t>0.0f); printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); e = hipEventElapsedTime(&t, stop, start); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t<0.0f); printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index e8e803e44c..b3f25658fc 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -58,7 +58,7 @@ public: void offset(int offset) { _offset = offset; }; int offset() const { return _offset; }; - + private: T * _A_d; T* _B_d; @@ -72,7 +72,7 @@ private: template DeviceMemory::DeviceMemory(size_t numElements) - : _maxNumElements(numElements), + : _maxNumElements(numElements), _offset(0) { T ** np = nullptr; @@ -93,7 +93,7 @@ DeviceMemory::~DeviceMemory () HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); HIPCHECK (hipFree(_C_dd)); - + _C_dd = NULL; }; @@ -125,7 +125,7 @@ public: T * A_hh; T* B_hh; - bool _usePinnedHost; + bool _usePinnedHost; private: size_t _maxNumElements; @@ -165,11 +165,11 @@ HostMemory::HostMemory(size_t 
numElements, bool usePinnedHost) template void -HostMemory::reset(size_t numElements, bool full) +HostMemory::reset(size_t numElements, bool full) { // Initialize the host data: for (size_t i=0; i void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", - __func__, + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", + __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, @@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(dmem->A_d()), + static_cast(dmem->B_d()), + dmem->C_d(), + numElements); if (useDeviceToDevice) { // Do an extra device-to-device copy here to mix things up: @@ -273,8 +282,8 @@ void memcpytest2_for_type(size_t numElements) { printSep(); - DeviceMemory memD(numElements); - HostMemory memU(numElements, 0/*usePinnedHost*/); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); HostMemory memP(numElements, 1/*usePinnedHost*/); for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { @@ -307,11 +316,11 @@ void memcpytest2_sizes(size_t maxElem=0) maxElem = free/sizeof(T)/8; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); for (size_t elem=1; elem<=maxElem; elem*=2) { @@ -336,11 +345,11 @@ void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) HIPCHECK(hipMemGetInfo(&free, &total)); - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory 
memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); size_t elem = maxElem / 2; @@ -380,16 +389,16 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - DeviceMemory memD(N); - HostMemory mem1(N, usePinnedHost); - HostMemory mem2(N, usePinnedHost); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); @@ -427,21 +436,21 @@ int main(int argc, char *argv[]) // Some tests around the 64KB boundary which have historically shown issues: printf ("\n\n=== tests&0x2 (64KB boundary)\n"); size_t maxElem = 32*1024*1024; - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); - HostMemory memP(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); // Just over 64MB: - memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + 
memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); } @@ -464,7 +473,7 @@ int main(int argc, char *argv[]) // Simplest cases: serialize the threads, and also used pinned memory: // This verifies that the sub-calls to memcpytest2 are correct. - multiThread_1(true, true); + multiThread_1(true, true); // Serialize, but use unpinned memory to stress the unpinned memory xfer path. multiThread_1(true, false); diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 316f50c01b..9a09e7e95c 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -63,7 +63,16 @@ void simpleTest1() HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); diff --git a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 4f73b67ad7..9d274543ab 100644 --- a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -41,8 +41,8 @@ void printSep() // Designed to stress a small number of simple smoke tests template< - typename T=float, - class P=HipTest::Unpinned, + typename T=float, + class P=HipTest::Unpinned, class C=HipTest::Memcpy > void simpleVectorAdd(size_t numElements, int iters, hipStream_t 
stream) @@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) // This is the null stream? //hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); - hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel( + HipTest::vectorADDReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); diff --git a/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp b/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp index b610315608..04a232f3bb 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -119,7 +119,7 @@ void Streamer::reset() { HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); H2D(); - + } @@ -128,7 +128,17 @@ void Streamer::enqueAsync() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + static_cast(_B_d), + _C_d, + _numElements, + p_repeat); } @@ -225,7 +235,17 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 
0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); if (p_db) { @@ -238,7 +258,7 @@ int main(int argc, char *argv[]) nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -257,13 +277,23 @@ int main(int argc, char *argv[]) auto lastStreamer = streamers[s - 1]; // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + hipLaunchKernel( + vectorADDRepeat, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0/*nullstream*/, + static_cast(lastStreamer->_C_d), + static_cast(lastStreamer->_C_d), + nullStreamer->_C_d, + numElements, + 1/*repeat*/); nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -289,10 +319,10 @@ int main(int argc, char *argv[]) // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. 
HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); - + HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); } diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp index c6a58ce7d4..962737774d 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -59,23 +59,23 @@ const char *syncModeString(int syncMode) { void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - // This test sends a long-running kernel to the null stream, then tests to see if the + // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // - // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it - // may complete before the check. To prevent this, the addCountReverse has a count parameter - // which causes it to loop repeatedly, and the results are checked in reverse order. + // may complete before the check. To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. 
- + if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -97,8 +97,17 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); - HIPCHECK(hipEventRecord(stop, 0/*default*/)); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0 /*stream*/, + static_cast(C_d), + C_h, + numElements, + count); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -108,18 +117,18 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s break; case syncOtherStream: // Does this synchronize with the null stream? 
- HIPCHECK(hipStreamSynchronize(otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - - // this may wait for NULL stream depending hipStreamNonBlocking flag above - HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); - HIPCHECK(hipStreamSynchronize(otherStream)); + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: - HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); @@ -197,7 +206,7 @@ void runTests(int64_t numElements) int main(int argc, char *argv[]) { // Can' destroy the default stream:// TODO - move to another test - HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index f5b1b79550..a7a930b4f6 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -88,7 +88,7 @@ private: template Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : - _preA_d(NULL), + _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), @@ -163,9 +163,27 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_commandType == COMMAND_ADD_REVERSE) { - hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCountReverse, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 
_stream, + static_cast(_A_d), + _C_d, + static_cast(_numElements), + static_cast(p_count)); } else if (_commandType == COMMAND_ADD_FORWARD) { - hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL( + HipTest::addCount, + dim3(blocks), + dim3(threadsPerBlock), + 0, + _stream, + static_cast(_A_d), + _C_d, + _numElements, + static_cast(p_count)); } else if (_commandType == COMMAND_COPY) { HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } else { @@ -239,7 +257,7 @@ size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool e return _mismatchCount; } - + //--- //Parse arguments specific to this test. @@ -300,7 +318,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< for (int i=0; iexpectedAdd(); - + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } @@ -330,7 +348,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< void sync_none(void) {}; -void sync_allDevices(int numDevices) +void sync_allDevices(int numDevices) { for (int d=0; d streamers) +void sync_queryAllUntilComplete(std::vector streamers) { for (int i=streamers.size()-1; i>=0; i--) { streamers[i]->queryUntilComplete(); @@ -347,7 +365,7 @@ void sync_queryAllUntilComplete(std::vector streamers) } -void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) { HIPCHECK(hipSetDevice(sideDeviceId)); @@ -389,7 +407,7 @@ int main(int argc, char *argv[]) initArray_h[i] = initValue; } HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); - + int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); @@ -414,7 +432,7 @@ int main(int argc, char *argv[]) // A sideband stream channel that is independent from above. 
- // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. std::vector sideStreams; for (int d=0; d Test 0x1000 simple null stream tests\n"); + printf ("==> Test 0x1000 simple null stream tests\n"); // try some null stream: hipStreamQuery(0); @@ -463,7 +481,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, s1)) HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } @@ -476,11 +494,11 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, hipStream_t(0))) HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } - + } diff --git a/hipamd/tests/src/stress/hipStressKernel.cpp b/hipamd/tests/src/stress/hipStressKernel.cpp index 7b5eec5a80..52d8fa1fe9 100644 --- a/hipamd/tests/src/stress/hipStressKernel.cpp +++ b/hipamd/tests/src/stress/hipStressKernel.cpp @@ -57,5 +57,8 @@ int main(){ } std::cout< Date: Wed, 1 Nov 2017 22:33:13 +0000 Subject: [PATCH 02/27] Correctly deal with functions from shared objects, wherein the program visible VA == so_base_va + st_value(function_symbol). Remove quaint usage of pfe for hipMemset (which is actually fill_n). --- hipamd/src/hip_memory.cpp | 133 ++++++++++++++++------------------- hipamd/src/program_state.cpp | 108 +++++++++++++++++++--------- 2 files changed, 136 insertions(+), 105 deletions(-) diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 96fc25c27d..32e0016178 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -1153,42 +1153,56 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) return ihipLogStatus(e); } -// TODO - make member function of stream? 
+namespace +{ + template< + uint32_t block_dim, + typename RandomAccessIterator, + typename N, + typename T> + __global__ + void hip_fill_n(RandomAccessIterator f, N n, T value) + { + const uint32_t grid_dim = hipGridDim_x; + + size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x; + while (idx < n) { + new (&f[idx]) T{value}; + idx += grid_dim; + } + } + + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + const T& clamp_integer(const T& x, const T& lower, const T& upper) + { + assert(!(upper < lower)); + + return std::min(upper, std::max(x, lower)); + } +} + template void ihipMemsetKernel(hipStream_t stream, - LockedAccessor_StreamCrit_t &crit, - T * ptr, T val, size_t sizeBytes, - hc::completion_future *cf) + T * ptr, T val, size_t sizeBytes) { - int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits); - const int threads_per_wg = 256; + static constexpr uint32_t block_dim = 256; - int threads = wg * threads_per_wg; - if (threads > sizeBytes) { - threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg; - } - - - hc::extent<1> ext(threads); - auto ext_tile = ext.tile(threads_per_wg); - - *cf = - hc::parallel_for_each( - crit->_av, - ext_tile, - [=] (hc::tiled_index<1> idx) - __attribute__((hc)) - { - int offset = amp_get_global_id(0); - // TODO-HCC - change to hc_get_local_size() - int stride = amp_get_local_size(0) * hc_get_num_groups(0) ; - - for (int i=offset; i( + sizeBytes / block_dim, 1, UINT32_MAX); + hipLaunchKernelGGL( + hip_fill_n, + dim3(grid_dim), + dim3{block_dim}, + 0u, + stream, + ptr, + sizeBytes, + std::move(val)); } @@ -1202,17 +1216,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = 
(value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1220,19 +1229,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av); - - if (HIP_API_BLOCKING) { tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); } } else { e = hipErrorInvalidValue; @@ -1253,16 +1259,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1270,21 +1272,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? 
- cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1305,17 +1304,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - size_t sizeBytes = pitch * height; if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1323,20 +1318,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? 
- cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1357,36 +1350,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { + std::cout << ex.what() << std::endl; e = hipErrorInvalidValue; } } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp index be871a6e84..a4f7fdbdbe 100644 --- a/hipamd/src/program_state.cpp +++ b/hipamd/src/program_state.cpp @@ -195,9 +195,9 @@ namespace static vector> blobs{ code_object_blob_for_process()}; - 
dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) { + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) { elfio tmp; - if (tmp.load(i->dlpi_name)) { + if (tmp.load(info->dlpi_name)) { const auto it = find_section_if(tmp, [](const section* x) { return x->get_name() == ".kernel"; }); @@ -269,6 +269,61 @@ namespace return r; } + vector> function_names_for( + const elfio& reader, section* symtab) + { + vector> r; + symbol_section_accessor symbols{reader, symtab}; + + auto foo = reader.get_entry(); + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + symbols.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { + r.emplace_back(value, name); + } + } + + return r; + } + + const vector>& function_names_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + + static vector> r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + r = function_names_for(reader, symtab); + }); + + return r; + } + inline hsa_agent_t agent(hsa_executable_symbol_t x) { @@ -395,43 +450,32 @@ namespace hip_impl { const unordered_map& function_names() { - static constexpr const char self[] = "/proc/self/exe"; - - static unordered_map r; + static unordered_map r{ + function_names_for_process().cbegin(), + function_names_for_process().cend()}; static once_flag f; call_once(f, []() { - elfio reader; + dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { + elfio tmp; + if (tmp.load(info->dlpi_name)) { + const auto 
it = find_section_if(tmp, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); - if (!reader.load(self)) { - throw runtime_error{ - "Failed to load the ELF file for the current process."}; - } + if (it) { + auto n = function_names_for(tmp, it); - auto symtab = find_section_if(reader, [](const section* x) { - return x->get_type() == SHT_SYMTAB; - }); + for (auto&& f : n) f.first += info->dlpi_addr; - symbol_section_accessor symbols{reader, symtab}; - - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - symbols.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { - r.emplace(value, name); + r.insert( + make_move_iterator(n.begin()), + make_move_iterator(n.end())); + } } - } + + return 0; + }, nullptr); }); return r; From 8c62d0fbc42ec0cf2cabc24187ec8e936c554ab9 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 10 Nov 2017 00:14:34 +0000 Subject: [PATCH 03/27] Update new tests so as to make them work with new variadic based launch mechanisms. 
--- .../src/runtimeApi/memory/hipMemcpyDtoD.cpp | 34 ++++++++++++++----- .../runtimeApi/memory/hipMemcpyDtoDAsync.cpp | 34 ++++++++++++++----- .../src/runtimeApi/memory/hipMemcpyPeer.cpp | 30 ++++++++++++---- .../runtimeApi/memory/hipMemcpyPeerAsync.cpp | 32 +++++++++++++---- 4 files changed, 101 insertions(+), 29 deletions(-) diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp index ccb02b74ce..c64b01f8a7 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp @@ -49,21 +49,39 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - + + HIPCHECK(hipSetDevice(1)); - HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes)); + HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes)); HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); @@ -73,8 +91,8 @@ int main() HIPCHECK(hipFree(Y_d)); HIPCHECK(hipFree(Z_d)); } - + passed(); - + } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp 
b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp index 5c99b43564..6d21ac62e7 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp @@ -50,25 +50,43 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HIPCHECK(hipStreamCreate(&s)); + + HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); - HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s)); + HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s)); HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s)); HIPCHECK(hipStreamSynchronize(s)); HIPCHECK(hipDeviceSynchronize()); - + HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); @@ -78,7 +96,7 @@ int main() } passed(); - + } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp index 7e2fc2d3d0..95b19c1090 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp @@ -48,24 +48,42 @@ int main() 
HIPCHECK(hipMalloc(&X_d,Nbytes)); HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HIPCHECK(hipSetDevice(1)); hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs. hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); HIPCHECK(hipFree(Y_d)); @@ -74,7 +92,7 @@ int main() passed(); - + } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp index 9d46ccb0d8..943e4a6b95 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp @@ -51,26 +51,44 @@ int main() HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); - + HIPCHECK(hipSetDevice(0)); HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, 
A_d,B_d, C_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(A_d), + static_cast(B_d), + C_d, + N); HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HIPCHECK(hipStreamCreate(&s)); + + HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s)); HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s)); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + hipLaunchKernel( + HipTest::vectorADD, + dim3(blocks), + dim3(threadsPerBlock), + 0, + 0, + static_cast(X_d), + static_cast(Y_d), + Z_d, + N); HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HIPCHECK (hipStreamSynchronize(s)); HipTest::checkVectorADD(A_h, B_h, C_h, N); - + HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); @@ -79,7 +97,7 @@ int main() } passed(); - + } From bdd2d6d6024d7494080c3a1be122b78790676345 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 10 Nov 2017 01:20:50 +0000 Subject: [PATCH 04/27] Add omitted changes in CMakeLists.txt. 
--- hipamd/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 1c8f640afb..e405d06ed6 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -167,6 +167,7 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") set(SOURCE_FILES_RUNTIME + src/code_object_bundle.cpp src/hip_hcc.cpp src/hip_context.cpp src/hip_device.cpp @@ -179,7 +180,8 @@ if(HIP_PLATFORM STREQUAL "hcc") src/hip_db.cpp src/grid_launch.cpp src/hip_texture.cpp - src/env.cpp) + src/env.cpp + src/program_state.cpp) set(SOURCE_FILES_DEVICE src/device_util.cpp From 27bc48763532586af307660d71ff751502c98f5c Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 20 Nov 2017 16:33:52 +0000 Subject: [PATCH 05/27] Correct ill-formed merge in earlier commit and adjust for differences with the new CUDA natural indexing mechanism. --- hipamd/include/hip/hcc_detail/hip_runtime.h | 15 ++++++++++++--- hipamd/src/grid_launch.cpp | 17 ----------------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index 924e774af0..c2302d4dc1 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -389,9 +389,18 @@ templatelockopen_preKernelCommand()); - - stream->lockclose_postKernelCommand(kernel_name, acc_v); - - delete static_cast(locked_stream); - locked_stream = nullptr; - if(HIP_PROFILE_API) { - MARKER_END(); ->>>>>>> e8ede28ec4f5744185b171031e537237afb7affa } } } From 24f8a93ff7a554ce56774c42ae1cb80fa67c65dc Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 20 Nov 2017 22:41:46 +0000 Subject: [PATCH 06/27] Clean-up some remaining noise in program_state.cpp. 
--- hipamd/src/program_state.cpp | 45 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp index a4f7fdbdbe..d5e2f80a05 100644 --- a/hipamd/src/program_state.cpp +++ b/hipamd/src/program_state.cpp @@ -17,9 +17,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -132,21 +134,39 @@ namespace const auto tmp = find_symbol_address( symbol_section_accessor{self_reader, process_symtab}, x); - assert(tmp.first); + if (!tmp.first) { + throw runtime_error{ + "The global variable: " + x + ", could not be found."}; + } - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), tmp.second, &agent, 1, &p); + static unordered_map< + Elf64_Addr, + unique_ptr> globals; + + if (globals.count(tmp.first) == 0) { + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(tmp.first), + tmp.second, + &agent, + 1, + &p); + + static mutex mtx; + + lock_guard lck{mtx}; + globals.emplace( + piecewise_construct, + make_tuple(tmp.first), + make_tuple(p, hsa_amd_memory_unlock)); + } + + const auto it = globals.find(tmp.first); + + assert(it != globals.cend()); hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - - static vector< - unique_ptr> globals; - static mutex mtx; - - lock_guard lck{mtx}; - globals.emplace_back(p, hsa_amd_memory_unlock); + executable, agent, x.c_str(), it->second.get()); } } @@ -265,7 +285,6 @@ namespace } }); - cout << r.size() << endl; return r; } From f8c1c1b38e7036432ee562ae816a40cafe75b5f2 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 21 Nov 2017 02:40:34 +0000 Subject: [PATCH 07/27] Refactor the __device__ versions of memset and memcpy to be less awkward i.e. not return nullptr as opposed to the destination pointer (it can only be assumed it was done for maximum confusion) and actually unroll as they claim to. 
Change all of the {to, from}Symbol functions to use hipModuleGetGlobal, as opposed to hc::accelerator::get_symbol_address which is no longer valid with module based dispatch. --- .../include/hip/hcc_detail/program_state.hpp | 20 +++ hipamd/src/device_util.cpp | 49 ++++++-- hipamd/src/hip_memory.cpp | 24 +++- hipamd/src/hip_module.cpp | 119 +++++++++++++----- hipamd/src/program_state.cpp | 107 +++++++--------- 5 files changed, 206 insertions(+), 113 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/program_state.hpp b/hipamd/include/hip/hcc_detail/program_state.hpp index 03701725eb..0e21b12f5f 100644 --- a/hipamd/include/hip/hcc_detail/program_state.hpp +++ b/hipamd/include/hip/hcc_detail/program_state.hpp @@ -35,6 +35,24 @@ THE SOFTWARE. struct ihipModuleSymbol_t; using hipFunction_t = ihipModuleSymbol_t*; +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + namespace hip_impl { struct Kernel_descriptor { @@ -50,6 +68,8 @@ namespace hip_impl } }; + const std::unordered_map< + hsa_agent_t, std::vector>& executables(); const std::unordered_map< std::uintptr_t, std::vector>>& functions(); diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 367a4c1a4f..b6aebdfce0 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr) // loop unrolling __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) { - uint8_t *dstPtr, *srcPtr; - dstPtr = (uint8_t*)dst; - srcPtr = (uint8_t*)src; - for(uint32_t i=0;i(dst); + auto srcPtr = static_cast(src); + + while (size >= 4u) { + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + + size -= 4u; + srcPtr += 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = 
srcPtr[2]; + case 2: dstPtr[1] = srcPtr[1]; + case 1: dstPtr[0] = srcPtr[0]; + } + + return dst; } -__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size) +__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) { - uint8_t *dstPtr; - dstPtr = (uint8_t*)ptr; - for(uint32_t i=0;i(dst); + + while (size >= 4u) { + dstPtr[0] = val; + dstPtr[1] = val; + dstPtr[2] = val; + dstPtr[3] = val; + + size -= 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = val; + case 2: dstPtr[1] = val; + case 1: dstPtr[0] = val; + } + + return dst; } __device__ float __hip_erfinvf(float x){ diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 047cf76c08..04ea38fcd5 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -715,7 +715,10 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -750,7 +753,10 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -787,7 +793,10 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, 
&byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -825,7 +834,10 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src); if(src == nullptr || dst == nullptr) @@ -1171,9 +1183,9 @@ namespace __global__ void hip_fill_n(RandomAccessIterator f, N n, T value) { - const uint32_t grid_dim = hipGridDim_x; + const uint32_t grid_dim = gridDim.x * blockDim.x; - size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x; + size_t idx = blockIdx.x * block_dim + threadIdx.x; while (idx < n) { new (&f[idx]) T{value}; idx += grid_dim; diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index df847f9f64..fb1cf29df8 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -554,16 +554,93 @@ namespace } inline - std::vector read_agent_globals(hipModule_t hmodule) + std::vector read_agent_globals( + hsa_agent_t agent, hsa_executable_t executable) { std::vector r; - hsa_executable_iterate_agent_symbols( - hmodule->executable, this_agent(), copy_agent_global_variables, &r); + executable, agent, copy_agent_global_variables, &r); return r; } + + template + std::pair read_global_description( + ForwardIterator f, ForwardIterator l, const char* name) + { + const auto it = std::find_if( + f, l, [=](const Agent_global& x) { return x.name == name; }); + + return it == l ? 
+ std::make_pair(nullptr, 0u) : + std::make_pair(it->address, it->byte_cnt); + } + + hipError_t read_agent_global_from_module( + hipDeviceptr_t *dptr, + size_t* bytes, + hipModule_t hmod, + const char* name) + { + static std::unordered_map< + hipModule_t, std::vector> agent_globals; + + // TODO: this is not particularly robust. + if (agent_globals.count(hmod) == 0) { + static std::mutex mtx; + std::lock_guard lck{mtx}; + + if (agent_globals.count(hmod) == 0) { + agent_globals.emplace( + hmod, read_agent_globals(this_agent(), hmod->executable)); + } + } + + // TODO: This is unsafe iff some other emplacement triggers rehashing. + // It will have to be properly fleshed out in the future. + const auto it0 = agent_globals.find(hmod); + if (it0 == agent_globals.cend()) { + throw std::runtime_error{"agent_globals data structure corrupted."}; + } + + std::tie(*dptr, *bytes) = read_global_description( + it0->second.cbegin(), it0->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } + + hipError_t read_agent_global_from_process( + hipDeviceptr_t *dptr, size_t* bytes, const char* name) + { + static std::unordered_map< + hsa_agent_t, std::vector> agent_globals; + static std::once_flag f; + + std::call_once(f, []() { + for (auto&& agent_executables : hip_impl::executables()) { + std::vector tmp0; + for (auto&& executable : agent_executables.second) { + auto tmp1 = read_agent_globals( + agent_executables.first, executable); + tmp0.insert( + tmp0.end(), + std::make_move_iterator(tmp1.begin()), + std::make_move_iterator(tmp1.end())); + } + agent_globals.emplace(agent_executables.first, std::move(tmp0)); + } + }); + + const auto it = agent_globals.find(this_agent()); + + if (it == agent_globals.cend()) return hipErrorNotInitialized; + + std::tie(*dptr, *bytes) = read_global_description( + it->second.cbegin(), it->second.cend(), name); + + return dptr ? 
hipSuccess : hipErrorNotFound; + } } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, @@ -574,41 +651,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, if(dptr == NULL || bytes == NULL){ return ihipLogStatus(hipErrorInvalidValue); } - if(name == NULL || hmod == NULL){ + if(name == NULL){ return ihipLogStatus(hipErrorNotInitialized); } else{ - static std::unordered_map< - hipModule_t, std::vector> agent_globals; + ret = hmod ? + read_agent_global_from_module(dptr, bytes, hmod, name) : + read_agent_global_from_process(dptr, bytes, name); - // TODO: this is not particularly robust. - if (agent_globals.count(hmod) == 0) { - static std::mutex mtx; - std::lock_guard lck{mtx}; - - if (agent_globals.count(hmod) == 0) { - agent_globals.emplace(hmod, read_agent_globals(hmod)); - } - } - - // TODO: This is unsafe iff some other emplacement triggers rehashing. - // It will have to be properly fleshed out in the future. - const auto it0 = agent_globals.find(hmod); - if (it0 == agent_globals.cend()) { - throw std::runtime_error{"agent_globals data structure corrupted."}; - } - - const auto it1 = std::find_if( - it0->second.cbegin(), - it0->second.cend(), - [=](const Agent_global& x) { return x.name == name; }); - - if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound); - - *dptr = it1->address; - *bytes = it1->byte_cnt; - - return ihipLogStatus(hipSuccess); + return ihipLogStatus(ret); } } diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp index d5e2f80a05..2bb115981b 100644 --- a/hipamd/src/program_state.cpp +++ b/hipamd/src/program_state.cpp @@ -31,14 +31,6 @@ using namespace std; namespace std { - template<> - struct hash { - size_t operator()(hsa_agent_t x) const - { - return hash{}(x.handle); - } - }; - template<> struct hash { size_t operator()(hsa_isa_t x) const @@ -48,13 +40,6 @@ namespace std }; } -inline -constexpr -bool operator==(hsa_agent_t x, hsa_agent_t y) -{ - return x.handle == 
y.handle; -} - inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) @@ -242,52 +227,6 @@ namespace return r; } - const unordered_map>& executables() - { - static unordered_map> r; - static once_flag f; - - call_once(f, []() { - static const auto accelerators = hc::accelerator::get_all(); - - for (auto&& acc : accelerators) { - auto agent = static_cast(acc.get_hsa_agent()); - - if (!agent) continue; - - hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { - const auto it = code_object_blobs().find(x); - - if (it != code_object_blobs().cend()) { - hsa_agent_t a = *static_cast(pa); - - for (auto&& blob : it->second) { - hsa_executable_t tmp = {}; - - hsa_executable_create_alt( - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, - &tmp); - - // TODO: this is massively inefficient and only - // meant for illustration. - string blob_to_str{blob.cbegin(), blob.cend()}; - stringstream istr{blob_to_str}; - tmp = load_executable(tmp, a, istr); - - if (tmp.handle) r[a].push_back(tmp); - } - } - - return HSA_STATUS_SUCCESS; - }, agent); - } - }); - - return r; - } - vector> function_names_for( const elfio& reader, section* symtab) { @@ -467,6 +406,52 @@ namespace namespace hip_impl { + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only + // meant for illustration. 
+ string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + return r; + } + const unordered_map& function_names() { static unordered_map r{ From fb1021cc0a98cd0e501e971284cbbb847941cfd1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 21 Nov 2017 13:15:13 +0000 Subject: [PATCH 08/27] This corrects how addresses are formed for symbols which reside in shared objects. For this case, the .value component of an ELF symbol holds the offset from the base VA where the shared object was loaded. Thus, to correctly obtain the VA of the object referred to by the symbol, we must add the offset to the VA where the shared object is loaded. We were already doing this correctly for symbols denoting functions, but we were incorrect for those denoting objects. --- hipamd/src/program_state.cpp | 191 +++++++++++++++++++---------------- 1 file changed, 106 insertions(+), 85 deletions(-) diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp index 2bb115981b..79d692e06f 100644 --- a/hipamd/src/program_state.cpp +++ b/hipamd/src/program_state.cpp @@ -49,6 +49,38 @@ bool operator==(hsa_isa_t x, hsa_isa_t y) namespace { + struct Symbol { + std::string name; + ELFIO::Elf64_Addr value = 0; + ELFIO::Elf_Xword size = 0; + ELFIO::Elf_Half sect_idx = 0; + std::uint8_t bind = 0; + std::uint8_t type = 0; + std::uint8_t other = 0; + }; + + inline + Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx) + { + assert(idx < section.get_symbols_num()); + + Symbol r; + section.get_symbol( + idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); + + return r; + } + + template + inline + section* find_section_if(elfio& reader, P p) + { + const auto it = find_if( + reader.sections.begin(), reader.sections.end(), move(p)); + + return it != reader.sections.end() ? 
*it : nullptr; + } + vector copy_names_of_undefined_symbols( const symbol_section_accessor& section) { @@ -57,47 +89,57 @@ namespace for (auto i = 0u; i != section.get_symbols_num(); ++i) { // TODO: this is boyscout code, caching the temporaries // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (sect_idx == SHN_UNDEF && !name.empty()) { - r.push_back(std::move(name)); + auto tmp = read_symbol(section, i); + if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) { + r.push_back(std::move(tmp.name)); } } return r; } - pair find_symbol_address( - const symbol_section_accessor& section, - const string& symbol_name) + const std::unordered_map< + std::string, + std::pair>& symbol_addresses() { - static constexpr pair r{0, 0}; + static unordered_map> r; + static once_flag f; - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; + call_once(f, []() { + dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { + static constexpr const char self[] = "/proc/self/exe"; + elfio reader; - section.get_symbol( - i, name, value, size, bind, type, sect_idx, other); + static unsigned int iter = 0u; + if (reader.load(!iter ? 
self : info->dlpi_name)) { + auto it = find_section_if( + reader, [](const class section* x) { + return x->get_type() == SHT_SYMTAB; + }); - if (name == symbol_name) return make_pair(value, size); - } + if (it) { + const symbol_section_accessor symtab{reader, it}; + + for (auto i = 0u; i != symtab.get_symbols_num(); ++i) { + auto tmp = read_symbol(symtab, i); + + if (tmp.type == STT_OBJECT && + tmp.sect_idx != SHN_UNDEF) { + const auto addr = + tmp.value + (iter ? info->dlpi_addr : 0); + r.emplace( + move(tmp.name), make_pair(addr, tmp.size)); + } + } + } + + ++iter; + } + + return 0; + }, nullptr); + }); return r; } @@ -116,55 +158,43 @@ namespace symbol_section_accessor{reader, code_object_dynsym}); for (auto&& x : undefined_symbols) { - const auto tmp = find_symbol_address( - symbol_section_accessor{self_reader, process_symtab}, x); + using RAII_global = + unique_ptr; - if (!tmp.first) { - throw runtime_error{ - "The global variable: " + x + ", could not be found."}; + static unordered_map globals; + static once_flag f; + call_once(f, [=]() { globals.reserve(symbol_addresses().size()); }); + + if (globals.find(x) != globals.cend()) return; + + const auto it1 = symbol_addresses().find(x); + + if (it1 == symbol_addresses().cend()) { + throw runtime_error{"Global symbol: " + x + " is undefined."}; } - static unordered_map< - Elf64_Addr, - unique_ptr> globals; + static mutex mtx; + lock_guard lck{mtx}; - if (globals.count(tmp.first) == 0) { - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(tmp.first), - tmp.second, - &agent, - 1, - &p); + if (globals.find(x) != globals.cend()) return; - static mutex mtx; + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(it1->second.first), + it1->second.second, + nullptr, // All agents. 
+ 0, + &p); - lock_guard lck{mtx}; - globals.emplace( - piecewise_construct, - make_tuple(tmp.first), - make_tuple(p, hsa_amd_memory_unlock)); - } - - const auto it = globals.find(tmp.first); - - assert(it != globals.cend()); + if (!p) { cerr << it1->first << endl; assert(false); } hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), it->second.get()); + executable, agent, x.c_str(), p); + + globals.emplace(x, RAII_global{p, hsa_amd_memory_unlock}); } } - template - inline - section* find_section_if(elfio& reader, P p) - { - const auto it = find_if( - reader.sections.begin(), reader.sections.end(), std::move(p)); - - return it != reader.sections.end() ? *it : nullptr; - } - vector code_object_blob_for_process() { static constexpr const char self[] = "/proc/self/exe"; @@ -217,8 +247,8 @@ namespace Bundled_code_header tmp{blob}; if (valid(tmp)) { for (auto&& bundle : bundles(tmp)) { - r[triple_to_hsa_isa(bundle.triple)] - .push_back(bundle.blob); + r[triple_to_hsa_isa(bundle.triple)].push_back( + bundle.blob); } } } @@ -233,24 +263,15 @@ namespace vector> r; symbol_section_accessor symbols{reader, symtab}; - auto foo = reader.get_entry(); - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { // TODO: this is boyscout code, caching the temporaries // may be of worth. 
- string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; + auto tmp = read_symbol(symbols, i); - symbols.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { - r.emplace_back(value, name); + if (tmp.type == STT_FUNC && + tmp.sect_idx != SHN_UNDEF && + !tmp.name.empty()) { + r.emplace_back(tmp.value, tmp.name); } } @@ -417,7 +438,7 @@ namespace hip_impl for (auto&& acc : accelerators) { auto agent = static_cast(acc.get_hsa_agent()); - if (!agent) continue; + if (!agent || !acc.is_hsa_accelerator()) continue; hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { const auto it = code_object_blobs().find(x); From 0755f1fc26850c2f0efec0ad386dcff2ecf621bc Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 21 Nov 2017 17:52:01 +0000 Subject: [PATCH 09/27] Modify the set component of the memcpy test (unclear why there is a memset component to begin with). --- hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp b/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp index 46656a434c..e845ae8f2f 100644 --- a/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp +++ b/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp @@ -23,7 +23,7 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In) __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) { int tx = threadIdx.x; - memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN))); + memset(ptr + tx, val, sizeof(uint32_t)); } int main() From dfa532db98c9badbeb11f89e77a98478727c1d12 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 22 Nov 2017 19:37:03 +0000 Subject: [PATCH 10/27] Remove leftover comment. 
--- hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index e3fa3331ac..7a9500f4d6 100644 --- a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -125,7 +125,7 @@ void hipLaunchKernelGGL( std::size_t kernarg_size = kernarg.size(); void* config[] = { - HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),//&kernarg, + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, HIP_LAUNCH_PARAM_END }; From 7761f63bbc69596d078f42dac9833a5c72e2bd33 Mon Sep 17 00:00:00 2001 From: Chris Kitching Date: Mon, 13 Nov 2017 17:20:07 +0000 Subject: [PATCH 11/27] Add hipify mappings for all CUDA headers that have HIP equivalents I'm particularly running into issues with `device_types.h` in real CUDA code... --- hipamd/hipify-clang/src/CUDA2HipMap.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hipamd/hipify-clang/src/CUDA2HipMap.cpp b/hipamd/hipify-clang/src/CUDA2HipMap.cpp index de6ddb2d74..b2e5251139 100644 --- a/hipamd/hipify-clang/src/CUDA2HipMap.cpp +++ b/hipamd/hipify-clang/src/CUDA2HipMap.cpp @@ -312,9 +312,16 @@ const std::map CUDA_TYPE_NAME_MAP{ /// Maps cuda header names to hip header names. 
const std::map CUDA_INCLUDE_MAP{ // CUDA includes - {"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}}, - {"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}}, - {"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}}, + {"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}}, + {"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}}, + {"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE, API_RUNTIME}}, + {"device_functions.h", {"hip/device_functions.h", CONV_INCLUDE, API_RUNTIME}}, + {"driver_types.h", {"hip/driver_types.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuComplex.h", {"hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda_fp16.h", {"hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda_texture_types.h", {"hip/hip_texture_types.h", CONV_INCLUDE, API_RUNTIME}}, + {"vector_types.h", {"hip/hip_vector_types.h", CONV_INCLUDE, API_RUNTIME}}, // CUBLAS includes {"cublas.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}}, From 4b7cb0624e6175f3d9d1104afef0cde1f0faff72 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 17 Nov 2017 16:00:28 +0000 Subject: [PATCH 12/27] This fixes some outright quaint choices made when implementing HIP's bitwise conversion functions, by using simple reinterpret_casts, as is idiomatic. These functions are supposed to be re-entrant, correct and efficient. Sadly, they were neither: they hid a massive race condition against a value stored in global memory, which means that they were also unreasonably slow if they ever managed to be correct, and relied on union based type punning which is in a grey area of the standard. It is difficult to ascertain what may have been the reason for coming up with this quirky solution. 
--- hipamd/src/device_functions.cpp | 53 ++++++++------------------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/hipamd/src/device_functions.cpp b/hipamd/src/device_functions.cpp index 615ae4d0b7..a66cc1e9fb 100644 --- a/hipamd/src/device_functions.cpp +++ b/hipamd/src/device_functions.cpp @@ -23,27 +23,6 @@ THE SOFTWARE. #include #include "device_util.h" -struct holder64Bit{ - union{ - double d; - unsigned long int uli; - signed long int sli; - signed int si[2]; - unsigned int ui[2]; - }; -} __attribute__((aligned(8))); - -struct holder32Bit { - union { - float f; - unsigned int ui; - signed int si; - }; -} __attribute__((aligned(4))); - -__device__ struct holder64Bit hold64; -__device__ struct holder32Bit hold32; - __device__ float __double2float_rd(double x) { return (double)x; @@ -64,13 +43,11 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - hold64.d = x; - return hold64.si[1]; + return reinterpret_cast(x)[1]; } __device__ int __double2loint(double x) { - hold64.d = x; - return hold64.si[0]; + return reinterpret_cast(x)[0]; } @@ -145,8 +122,7 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - hold64.d = x; - return hold64.sli; + return reinterpret_cast(x); } __device__ int __float2int_rd(float x) @@ -219,19 +195,17 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - hold32.f = x; - return hold32.si; + return reinterpret_cast(x); } __device__ unsigned int __float_as_uint(float x) { - hold32.f = x; - return hold32.ui; + return reinterpret_cast(x); } __device__ double __hiloint2double(int hi, int lo) -{ - hold64.si[1] = hi; - hold64.si[0] = lo; - return hold64.d; +{ // TODO: this matches the original in not considering endianness, is that + // correct though? 
+ int tmp[] = {lo, hi}; + return reinterpret_cast(tmp); } __device__ double __int2double_rn(int x) { @@ -257,8 +231,7 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - hold32.si = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ll2double_rd(long long int x) @@ -297,8 +270,7 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - hold64.sli = x; - return hold64.d; + return reinterpret_cast(x); } __device__ double __uint2double_rn(int x) @@ -325,8 +297,7 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - hold32.ui = x; - return hold32.f; + return reinterpret_cast(x); } __device__ double __ull2double_rd(unsigned long long int x) From bb35299560c023fec1dd96529da643c545f6237f Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 18 Nov 2017 01:16:31 +0000 Subject: [PATCH 13/27] This actually (tries) to do the right thing all the way, by using memcpy for bitcasting, and not rely on undefined behaviour of a different flavour as a substitute for the original undefined behaviour. Note that the compiler will (should) optimise down to the same emitted code, since this is a pattern it understands. 
--- hipamd/src/device_functions.cpp | 71 +++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/hipamd/src/device_functions.cpp b/hipamd/src/device_functions.cpp index a66cc1e9fb..63425bc9f4 100644 --- a/hipamd/src/device_functions.cpp +++ b/hipamd/src/device_functions.cpp @@ -43,11 +43,21 @@ __device__ float __double2float_rz(double x) __device__ int __double2hiint(double x) { - return reinterpret_cast(x)[1]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[1]; } __device__ int __double2loint(double x) { - return reinterpret_cast(x)[0]; + static_assert(sizeof(double) == 2 * sizeof(int), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &x, sizeof(tmp)); + + return tmp[0]; } @@ -122,7 +132,12 @@ __device__ unsigned long long int __double2ull_rz(double x) __device__ long long int __double_as_longlong(double x) { - return reinterpret_cast(x); + static_assert(sizeof(long long) == sizeof(double), ""); + + long long tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ int __float2int_rd(float x) @@ -195,17 +210,32 @@ __device__ unsigned long long int __float2ull_rz(float x) __device__ int __float_as_int(float x) { - return reinterpret_cast(x); + static_assert(sizeof(int) == sizeof(float), ""); + + int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ unsigned int __float_as_uint(float x) { - return reinterpret_cast(x); + static_assert(sizeof(unsigned int) == sizeof(float), ""); + + unsigned int tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } -__device__ double __hiloint2double(int hi, int lo) -{ // TODO: this matches the original in not considering endianness, is that - // correct though? 
- int tmp[] = {lo, hi}; - return reinterpret_cast(tmp); +__device__ double __hiloint2double(int32_t hi, int32_t lo) +{ + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + uint64_t tmp0 = + (static_cast(hi) << 32ull) | static_cast(lo); + double tmp1; + __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + + return tmp1; } __device__ double __int2double_rn(int x) { @@ -231,7 +261,12 @@ __device__ float __int2float_rz(int x) __device__ float __int_as_float(int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ll2double_rd(long long int x) @@ -270,7 +305,12 @@ __device__ float __ll2float_rz(long long int x) __device__ double __longlong_as_double(long long int x) { - return reinterpret_cast(x); + static_assert(sizeof(double) == sizeof(long long), ""); + + double tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __uint2double_rn(int x) @@ -297,7 +337,12 @@ __device__ float __uint2float_rz(unsigned int x) __device__ float __uint_as_float(unsigned int x) { - return reinterpret_cast(x); + static_assert(sizeof(float) == sizeof(unsigned int), ""); + + float tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; } __device__ double __ull2double_rd(unsigned long long int x) From cc418da6542d601ac7d505f88472ff94183c2812 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 23 Nov 2017 09:57:24 +0530 Subject: [PATCH 14/27] Fix float2int rounding functions Change-Id: I67943859a6344c5eec0eaa23418c9b802ef72468 --- hipamd/src/device_functions.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/hipamd/src/device_functions.cpp b/hipamd/src/device_functions.cpp index 63425bc9f4..6f91e3c939 100644 --- a/hipamd/src/device_functions.cpp +++ b/hipamd/src/device_functions.cpp @@ -23,6 +23,27 @@ THE SOFTWARE. 
#include #include "device_util.h" +struct holder64Bit{ + union{ + double d; + unsigned long int uli; + signed long int sli; + signed int si[2]; + unsigned int ui[2]; + }; +} __attribute__((aligned(8))); + +struct holder32Bit { + union { + float f; + unsigned int ui; + signed int si; + }; +} __attribute__((aligned(4))); + +__device__ struct holder64Bit hold64; +__device__ struct holder32Bit hold32; + __device__ float __double2float_rd(double x) { return (double)x; From 6a0efb7ed253dc7f8dfa3a1750c6a7e527139ed3 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 28 Nov 2017 19:15:29 +0000 Subject: [PATCH 15/27] Re-sync with upstream and re-factor platform global management for texture references. --- .../include/hip/hcc_detail/program_state.hpp | 4 ++++ hipamd/src/hip_module.cpp | 6 ++--- hipamd/src/program_state.cpp | 24 +++++++++---------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/program_state.hpp b/hipamd/include/hip/hcc_detail/program_state.hpp index 0e21b12f5f..65896e97a7 100644 --- a/hipamd/include/hip/hcc_detail/program_state.hpp +++ b/hipamd/include/hip/hcc_detail/program_state.hpp @@ -23,6 +23,7 @@ THE SOFTWARE. 
#pragma once #include +#include #include #include @@ -68,12 +69,15 @@ namespace hip_impl } }; + using RAII_global = std::unique_ptr; + const std::unordered_map< hsa_agent_t, std::vector>& executables(); const std::unordered_map< std::uintptr_t, std::vector>>& functions(); const std::unordered_map& function_names(); + std::unordered_map& globals(); hsa_executable_t load_executable( hsa_executable_t executable, hsa_agent_t agent, std::istream& file); diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index d8fa2db097..00ffd8b03b 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -725,9 +725,9 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const if(name == NULL || hmod == NULL){ ret = hipErrorNotInitialized; } else{ - const auto it = hmod->coGlobals.find(name); - if (it == hmod->coGlobals.end()) return ihipLogStatus(hipErrorInvalidValue); - *texRef = reinterpret_cast(it->second); + const auto it = hip_impl::globals().find(name); + if (it == hip_impl::globals().end()) return ihipLogStatus(hipErrorInvalidValue); + *texRef = reinterpret_cast(it->second.get()); ret = hipSuccess; } } diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp index 79d692e06f..61c90556be 100644 --- a/hipamd/src/program_state.cpp +++ b/hipamd/src/program_state.cpp @@ -158,14 +158,7 @@ namespace symbol_section_accessor{reader, code_object_dynsym}); for (auto&& x : undefined_symbols) { - using RAII_global = - unique_ptr; - - static unordered_map globals; - static once_flag f; - call_once(f, [=]() { globals.reserve(symbol_addresses().size()); }); - - if (globals.find(x) != globals.cend()) return; + if (globals().find(x) != globals().cend()) return; const auto it1 = symbol_addresses().find(x); @@ -176,7 +169,7 @@ namespace static mutex mtx; lock_guard lck{mtx}; - if (globals.find(x) != globals.cend()) return; + if (globals().find(x) != globals().cend()) return; void* p = nullptr; hsa_amd_memory_lock( @@ -186,12 
+179,10 @@ namespace 0, &p); - if (!p) { cerr << it1->first << endl; assert(false); } - hsa_executable_agent_global_variable_define( executable, agent, x.c_str(), p); - globals.emplace(x, RAII_global{p, hsa_amd_memory_unlock}); + globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock}); } } @@ -534,6 +525,15 @@ namespace hip_impl return r; } + unordered_map& globals() + { + static unordered_map r; + static once_flag f; + call_once(f, []() { r.reserve(symbol_addresses().size()); }); + + return r; + } + hsa_executable_t load_executable( hsa_executable_t executable, hsa_agent_t agent, istream& file) { From d5c8de3f41a5e02d3284f4359805cec694e66a75 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 28 Nov 2017 19:45:47 +0000 Subject: [PATCH 16/27] Change memset kernel to use memcpy instead of placement new. Simplify indexers. --- hipamd/include/hip/hcc_detail/hip_runtime.h | 15 +++------------ hipamd/src/hip_memory.cpp | 5 ++++- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index c2302d4dc1..924e774af0 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -389,18 +389,9 @@ template(&f[idx]), + reinterpret_cast(&value), + sizeof(T)); idx += grid_dim; } } From 45a9f4f7b1d42dc52377961a34de7f5b10c56df8 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 28 Nov 2017 19:56:04 +0000 Subject: [PATCH 17/27] Remove leftover agent allocated globals. 
--- hipamd/src/device_functions.cpp | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/hipamd/src/device_functions.cpp b/hipamd/src/device_functions.cpp index b888bde5d5..396b5b0594 100644 --- a/hipamd/src/device_functions.cpp +++ b/hipamd/src/device_functions.cpp @@ -28,27 +28,6 @@ extern "C" float __ocml_rint_f32(float); extern "C" float __ocml_ceil_f32(float); extern "C" float __ocml_trunc_f32(float); -struct holder64Bit{ - union{ - double d; - unsigned long int uli; - signed long int sli; - signed int si[2]; - unsigned int ui[2]; - }; -} __attribute__((aligned(8))); - -struct holder32Bit { - union { - float f; - unsigned int ui; - signed int si; - }; -} __attribute__((aligned(4))); - -__device__ struct holder64Bit hold64; -__device__ struct holder32Bit hold32; - __device__ float __double2float_rd(double x) { return (double)x; From c6ab6f292bffd3de9034402f910e3a51f9647ea8 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 00:17:44 +0000 Subject: [PATCH 18/27] Choose whether or not to use functional grid_launch based on the version of HCC used to compile. 
--- .../hip/hcc_detail/code_object_bundle.hpp | 22 + hipamd/include/hip/hcc_detail/concepts.hpp | 2 +- .../hip/hcc_detail/functional_grid_launch.hpp | 159 +++ .../hip/hcc_detail/grid_launch_GGL.hpp | 146 +-- .../hcc_detail/macro_based_grid_launch.hpp | 1004 +++++++++++++++++ hipamd/src/functional_grid_launch.inl | 138 +++ hipamd/src/grid_launch.cpp | 121 +- hipamd/src/hip_module.cpp | 1 + hipamd/src/macro_based_grid_launch.inl | 99 ++ 9 files changed, 1436 insertions(+), 256 deletions(-) create mode 100644 hipamd/include/hip/hcc_detail/functional_grid_launch.hpp create mode 100644 hipamd/include/hip/hcc_detail/macro_based_grid_launch.hpp create mode 100644 hipamd/src/functional_grid_launch.inl create mode 100644 hipamd/src/macro_based_grid_launch.inl diff --git a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp index 080132c561..05ba44fcc8 100644 --- a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp +++ b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + #pragma once #include diff --git a/hipamd/include/hip/hcc_detail/concepts.hpp b/hipamd/include/hip/hcc_detail/concepts.hpp index 5c50f5d577..18c1119b73 100644 --- a/hipamd/include/hip/hcc_detail/concepts.hpp +++ b/hipamd/include/hip/hcc_detail/concepts.hpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp b/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp new file mode 100644 index 0000000000..bbffae52e8 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp @@ -0,0 +1,159 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "code_object_bundle.hpp" +#include "concepts.hpp" +#include "helpers.hpp" +#include "program_state.hpp" + +#include "hc.hpp" +#include "hip/hip_hcc.h" +#include "hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl +{ + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + T round_up_to_next_multiple_nonnegative(T x, T y) + { + T tmp = x + y - 1; + return tmp - tmp % y; + } + + inline + std::vector make_kernarg() + { + return {}; + } + + inline + std::vector make_kernarg(std::vector kernarg) + { + return kernarg; + } + + template + inline + std::vector make_kernarg(std::vector kernarg, T x) + { + kernarg.resize( + round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + + sizeof(T)); + + new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; + + return kernarg; + } + + template + inline + std::vector make_kernarg( + std::vector kernarg, T x, Ts... xs) + { + return make_kernarg( + make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); + } + + template + inline + std::vector make_kernarg(Ts... xs) + { + std::vector kernarg; + kernarg.reserve(sizeof(std::tuple)); + + return make_kernarg(std::move(kernarg), std::move(xs)...); + } + + void hipLaunchKernelGGLImpl( + std::uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + void** kernarg); +} // Namespace hip_impl. + +template +inline +void hipLaunchKernelGGL( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t sharedMemBytes, + hipStream_t stream, + Args... 
args) +{ + auto kernarg = hip_impl::make_kernarg(std::move(args)...); + std::size_t kernarg_size = kernarg.size(); + + void* config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), + HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, + HIP_LAUNCH_PARAM_END + }; + + hip_impl::hipLaunchKernelGGLImpl( + reinterpret_cast(kernel), + numBlocks, + dimBlocks, + sharedMemBytes, + stream, + &config[0]); +} + +template +inline +void hipLaunchKernel( + F kernel, + const dim3& numBlocks, + const dim3& dimBlocks, + std::uint32_t groupMemBytes, + hipStream_t stream, + Args... args) +{ + hipLaunchKernelGGL( + kernel, + numBlocks, + dimBlocks, + groupMemBytes, + stream, + hipLaunchParm{}, + std::move(args)...); +} + diff --git a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 7a9500f4d6..187d84dbff 100644 --- a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -20,143 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#pragma once - #if GENERIC_GRID_LAUNCH == 1 - -#include "code_object_bundle.hpp" -#include "concepts.hpp" -#include "helpers.hpp" -#include "program_state.hpp" - -#include "hc.hpp" -#include "hip/hip_hcc.h" -#include "hip_runtime.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace hip_impl -{ - template< - typename T, - typename std::enable_if{}>::type* = nullptr> - inline - T round_up_to_next_multiple_nonnegative(T x, T y) - { - T tmp = x + y - 1; - return tmp - tmp % y; - } - - inline - std::vector make_kernarg() - { - return {}; - } - - inline - std::vector make_kernarg(std::vector kernarg) - { - return kernarg; - } - - template - inline - std::vector make_kernarg(std::vector kernarg, T x) - { - kernarg.resize( - round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + - sizeof(T)); - - new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)}; - - return kernarg; - } - - template - inline - std::vector make_kernarg( - std::vector kernarg, T x, Ts... xs) - { - return make_kernarg( - make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...); - } - - template - inline - std::vector make_kernarg(Ts... xs) - { - std::vector kernarg; - kernarg.reserve(sizeof(std::tuple)); - - return make_kernarg(std::move(kernarg), std::move(xs)...); - } - - void hipLaunchKernelGGLImpl( - std::uintptr_t function_address, - const dim3& numBlocks, - const dim3& dimBlocks, - std::uint32_t sharedMemBytes, - hipStream_t stream, - void** kernarg); -} // Namespace hip_impl. - -template -inline -void hipLaunchKernelGGL( - F kernel, - const dim3& numBlocks, - const dim3& dimBlocks, - std::uint32_t sharedMemBytes, - hipStream_t stream, - Args... 
args) -{ - auto kernarg = hip_impl::make_kernarg(std::move(args)...); - std::size_t kernarg_size = kernarg.size(); - - void* config[] = { - HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), - HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size, - HIP_LAUNCH_PARAM_END - }; - - hip_impl::hipLaunchKernelGGLImpl( - reinterpret_cast(kernel), - numBlocks, - dimBlocks, - sharedMemBytes, - stream, - &config[0]); -} - -template -inline -void hipLaunchKernel( - F kernel, - const dim3& numBlocks, - const dim3& dimBlocks, - std::uint32_t groupMemBytes, - hipStream_t stream, - Args... args) -{ - hipLaunchKernelGGL( - kernel, - numBlocks, - dimBlocks, - groupMemBytes, - stream, - hipLaunchParm{}, - std::move(args)...); -} - -#endif //GENERIC_GRID_LAUNCH + #if __hcc_workweek__ >= 17481 + #define FUNCTIONAL_GRID_LAUNCH + #include "functional_grid_launch.hpp" + #else + #include "macro_based_grid_launch.hpp" + #endif +#endif //GENERIC_GRID_LAUNCH \ No newline at end of file diff --git a/hipamd/include/hip/hcc_detail/macro_based_grid_launch.hpp b/hipamd/include/hip/hcc_detail/macro_based_grid_launch.hpp new file mode 100644 index 0000000000..f1dfe76245 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/macro_based_grid_launch.hpp @@ -0,0 +1,1004 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "concepts.hpp" +#include "helpers.hpp" + +#include "hc.hpp" +#include "hip/hip_hcc.h" +#include "hip_runtime.h" + +#include +#include +#include +#include +#include + +namespace hip_impl +{ + namespace + { + struct New_grid_launch_tag {}; + struct Old_grid_launch_tag {}; + + template + class RAII_guard { + D dtor_; + public: + RAII_guard() = default; + + RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} + { + ctor(); + } + + RAII_guard(const RAII_guard&) = default; + RAII_guard(RAII_guard&&) = default; + + RAII_guard& operator=(const RAII_guard&) = default; + RAII_guard& operator=(RAII_guard&&) = default; + + ~RAII_guard() { dtor_(); } + }; + + template + RAII_guard make_RAII_guard(const C& ctor, D dtor) + { + return RAII_guard{ctor, std::move(dtor)}; + } + + template + using is_new_grid_launch_t = typename std::conditional< + is_callable{}, + New_grid_launch_tag, + Old_grid_launch_tag>::type; + } + + // TODO: - dispatch rank should be derived from the domain dimensions passed + // in, and not always assumed to be 3; + + template + requires(Domain == {Ts...}) + inline + void grid_launch_hip_impl_( + New_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + const hc::accelerator_view& acc_v, + K k) + { + const auto d = hc::extent<3>{ + num_blocks.z * dim_blocks.z, + num_blocks.y * dim_blocks.y, + num_blocks.x * dim_blocks.x}.tile_with_dynamic( + dim_blocks.z, + dim_blocks.y, + dim_blocks.x, + group_mem_bytes); + + try { + 
hc::parallel_for_each(acc_v, d, k); + } + catch (std::exception& ex) { + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; + throw; + } + } + + // TODO: these are workarounds, they should be removed. + + hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); + void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); + void unlock_stream_hip_( + hipStream_t, void*, const char*, hc::accelerator_view*); + + template + requires(Domain == {Ts...}) + inline + void grid_launch_hip_impl_( + New_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + void* lck_stream = nullptr; + auto acc_v = lock_stream_hip_(stream, lck_stream); + auto stream_guard = make_RAII_guard( + std::bind( + print_prelaunch_trace_, + kernel_name, + num_blocks, + dim_blocks, + group_mem_bytes, + stream), + std::bind( + unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); + + try { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + acc_v, + std::move(k)); + } + catch (std::exception& ex) { + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; + throw; + } + } + + template + requires(Domain == {hipLaunchParm, Ts...}) + inline + void grid_launch_hip_impl_( + Old_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + K k) + { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + std::move(k)); + } + + template + requires(Domain == {hipLaunchParm, Ts...}) + inline + void grid_launch_hip_impl_( + Old_grid_launch_tag, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + grid_launch_hip_impl_( + New_grid_launch_tag{}, + std::move(num_blocks), + 
std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + kernel_name, + std::move(k)); + } + + template + requires(Domain == {Ts...}) + inline + std::enable_if_t::value> grid_launch_hip_( + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + const char* kernel_name, + K k) + { + grid_launch_hip_impl_( + is_new_grid_launch_t{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + kernel_name, + std::move(k)); + } + + template + requires(Domain == {Ts...}) + inline + std::enable_if_t::value> grid_launch_hip_( + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream, + K k) + { + grid_launch_hip_impl_( + is_new_grid_launch_t{}, + std::move(num_blocks), + std::move(dim_blocks), + group_mem_bytes, + std::move(stream), + std::move(k)); + } + + // TODO: these are temporary and purposefully noisy and disruptive. + #define make_kernel_name_hip(k, n)\ + HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ + HIP_kernel_functor_name_end ## _ ## n + + #define make_kernel_functor_hip_30(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26, p27)\ + struct make_kernel_name_hip(function_name, 28) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + std::decay_t _p27_;\ + void operator()(const hc::tiled_index<3>&) const 
[[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_, _p27_);\ + }\ + } + #define make_kernel_functor_hip_29(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26)\ + struct make_kernel_name_hip(function_name, 27) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_);\ + }\ + } + #define make_kernel_functor_hip_28(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25)\ + struct make_kernel_name_hip(function_name, 26) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + 
std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ + }\ + } + #define make_kernel_functor_hip_27(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24)\ + struct make_kernel_name_hip(function_name, 25) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ + }\ + } + #define make_kernel_functor_hip_26(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ + struct make_kernel_name_hip(function_name, 24) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t 
_p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ + }\ + } + #define make_kernel_functor_hip_25(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ + struct make_kernel_name_hip(function_name, 23) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + __attribute__((used, flatten))\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_);\ + }\ + } + #define make_kernel_functor_hip_24(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ + struct make_kernel_name_hip(function_name, 22) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + 
std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_);\ + }\ + } + #define make_kernel_functor_hip_23(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ + struct make_kernel_name_hip(function_name, 21) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_);\ + }\ + } + #define make_kernel_functor_hip_22(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ + struct make_kernel_name_hip(function_name, 20) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + 
std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_);\ + }\ + } + #define make_kernel_functor_hip_21(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18)\ + struct make_kernel_name_hip(function_name, 19) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_);\ + }\ + } + #define make_kernel_functor_hip_20(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17)\ + struct make_kernel_name_hip(function_name, 18) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, 
_p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ + }\ + } + #define make_kernel_functor_hip_19(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16)\ + struct make_kernel_name_hip(function_name, 17) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ + }\ + } + #define make_kernel_functor_hip_18(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15)\ + struct make_kernel_name_hip(function_name, 16) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ + }\ + } + #define make_kernel_functor_hip_17(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14)\ + struct make_kernel_name_hip(function_name, 15) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ 
+ std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_);\ + }\ + } + #define make_kernel_functor_hip_16(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13)\ + struct make_kernel_name_hip(function_name, 14) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_);\ + }\ + } + #define make_kernel_functor_hip_15(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12)\ + struct make_kernel_name_hip(function_name, 13) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_);\ + }\ + } + #define make_kernel_functor_hip_14(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11)\ + struct make_kernel_name_hip(function_name, 12) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t 
_p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_);\ + }\ + } + #define make_kernel_functor_hip_13(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ + struct make_kernel_name_hip(function_name, 11) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_);\ + }\ + } + #define make_kernel_functor_hip_12(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ + struct make_kernel_name_hip(function_name, 10) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ + _p9_);\ + }\ + } + #define make_kernel_functor_hip_11(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ + struct make_kernel_name_hip(function_name, 9) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ + }\ + } + #define make_kernel_functor_hip_10(\ + function_name, kernel_name, p0, 
p1, p2, p3, p4, p5, p6, p7)\ + struct make_kernel_name_hip(function_name, 8) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ + }\ + } + #define make_kernel_functor_hip_9(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ + struct make_kernel_name_hip(function_name, 7) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ + }\ + } + #define make_kernel_functor_hip_8(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ + struct make_kernel_name_hip(function_name, 6) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ + }\ + } + #define make_kernel_functor_hip_7(\ + function_name, kernel_name, p0, p1, p2, p3, p4)\ + struct make_kernel_name_hip(function_name, 5) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ + }\ + } + #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ + struct make_kernel_name_hip(function_name, 4) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_);\ + }\ + } + #define make_kernel_functor_hip_5(function_name, kernel_name, p0, 
p1, p2)\ + struct make_kernel_name_hip(function_name, 3) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_);\ + }\ + } + #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ + struct make_kernel_name_hip(function_name, 2) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_);\ + }\ + } + #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n + #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ + struct make_kernel_name_hip(function_name, 1) {\ + std::decay_t _p0_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_);\ + }\ + } + #define make_kernel_functor_hip_2(function_name, kernel_name)\ + struct make_kernel_name_hip(function_name, 0) {\ + void operator()(const hc::tiled_index<3>&) [[hc]]\ + {\ + return kernel_name(hipLaunchParm{});\ + }\ + } + #define make_kernel_functor_hip_1(...) + #define make_kernel_functor_hip_0(...) 
+ #define make_kernel_functor_hip_(...)\ + overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) + + + #define hipLaunchNamedKernelGGL(\ + function_name,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ...)\ + do {\ + make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ + hip_kernel_functor_impl_{__VA_ARGS__};\ + hip_impl::grid_launch_hip_(\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + #kernel_name,\ + hip_kernel_functor_impl_);\ + } while(0) + + #define hipLaunchKernelGGL(\ + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchNamedKernelGGL(\ + unnamed,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ##__VA_ARGS__);\ + } while (0) + + #define hipLaunchKernel(\ + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) +} \ No newline at end of file diff --git a/hipamd/src/functional_grid_launch.inl b/hipamd/src/functional_grid_launch.inl new file mode 100644 index 0000000000..4a26f66c8c --- /dev/null +++ b/hipamd/src/functional_grid_launch.inl @@ -0,0 +1,138 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hcc_detail/grid_launch_GGL.hpp" +#include "hip/hcc_detail/program_state.hpp" + +#include "hip/hip_runtime_api.h" + +// Internal header, do not percolate upwards. +#include "hip_hcc_internal.h" +#include "hc.hpp" +#include "trace_helper.h" + +#include +#include +#include +#include + +#include + +using namespace hc; +using namespace std; + +namespace hip_impl +{ + namespace + { + inline + string name(uintptr_t function_address) + { + const auto it = function_names().find(function_address); + + if (it == function_names().cend()) { + throw runtime_error{ + "Invalid function passed to hipLaunchKernelGGL."}; + } + + return it->second; + } + + inline + string name(hsa_agent_t agent) + { + char n[64] = {}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return string{n}; + } + + inline + hsa_agent_t target_agent(hipStream_t stream) + { + if (stream) { + return *static_cast( + stream->locked_getAv()->get_hsa_agent()); + } + else if ( + ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + return ihipGetDevice( + ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; + } + else { + return *static_cast( + accelerator{}.get_default_view().get_hsa_agent()); + } + } + } + + void hipLaunchKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, + hipStream_t stream, + void** kernarg) + { + const auto it0 = functions().find(function_address); + + if (it0 == functions().cend()) { + throw runtime_error{ + "No 
device code available for function: " + + name(function_address) + }; + } + + auto agent = target_agent(stream); + + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first.handle == agent.handle; + }); + + if (it1 == it0->second.cend()) { + throw runtime_error{ + "No code available for function: " + name(function_address) + + ", for agent: " + name(agent) + }; + } + + for (auto&& agent_kernel : it0->second) { + if (agent.handle == agent_kernel.first.handle) { + hipModuleLaunchKernel( + agent_kernel.second, + numBlocks.x, + numBlocks.y, + numBlocks.z, + dimBlocks.x, + dimBlocks.y, + dimBlocks.z, + sharedMemBytes, + stream, + nullptr, + kernarg); + } + } + } +} diff --git a/hipamd/src/grid_launch.cpp b/hipamd/src/grid_launch.cpp index 4a26f66c8c..484d314fba 100644 --- a/hipamd/src/grid_launch.cpp +++ b/hipamd/src/grid_launch.cpp @@ -20,119 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip/hcc_detail/grid_launch_GGL.hpp" -#include "hip/hcc_detail/program_state.hpp" - -#include "hip/hip_runtime_api.h" - -// Internal header, do not percolate upwards. 
-#include "hip_hcc_internal.h" -#include "hc.hpp" -#include "trace_helper.h" - -#include -#include -#include -#include - -#include - -using namespace hc; -using namespace std; - -namespace hip_impl -{ - namespace - { - inline - string name(uintptr_t function_address) - { - const auto it = function_names().find(function_address); - - if (it == function_names().cend()) { - throw runtime_error{ - "Invalid function passed to hipLaunchKernelGGL."}; - } - - return it->second; - } - - inline - string name(hsa_agent_t agent) - { - char n[64] = {}; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); - - return string{n}; - } - - inline - hsa_agent_t target_agent(hipStream_t stream) - { - if (stream) { - return *static_cast( - stream->locked_getAv()->get_hsa_agent()); - } - else if ( - ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { - return ihipGetDevice( - ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; - } - else { - return *static_cast( - accelerator{}.get_default_view().get_hsa_agent()); - } - } - } - - void hipLaunchKernelGGLImpl( - uintptr_t function_address, - const dim3& numBlocks, - const dim3& dimBlocks, - uint32_t sharedMemBytes, - hipStream_t stream, - void** kernarg) - { - const auto it0 = functions().find(function_address); - - if (it0 == functions().cend()) { - throw runtime_error{ - "No device code available for function: " + - name(function_address) - }; - } - - auto agent = target_agent(stream); - - const auto it1 = find_if( - it0->second.cbegin(), - it0->second.cend(), - [=](const pair& x) { - return x.first.handle == agent.handle; - }); - - if (it1 == it0->second.cend()) { - throw runtime_error{ - "No code available for function: " + name(function_address) + - ", for agent: " + name(agent) - }; - } - - for (auto&& agent_kernel : it0->second) { - if (agent.handle == agent_kernel.first.handle) { - hipModuleLaunchKernel( - agent_kernel.second, - numBlocks.x, - numBlocks.y, - numBlocks.z, - dimBlocks.x, - dimBlocks.y, - 
dimBlocks.z, - sharedMemBytes, - stream, - nullptr, - kernarg); - } - } - } -} +#if defined(FUNCTIONAL_GRID_LAUNCH) + #include "functional_grid_launch.inl" +#else + #include "macro_based_grid_launch.inl" +#endif \ No newline at end of file diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index 00ffd8b03b..1477247ae2 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -37,6 +37,7 @@ THE SOFTWARE. #include "elfio/elfio.hpp" #include "hip/hip_runtime.h" +#include "hip/hcc_detail/program_state.hpp" #include "hip_hcc_internal.h" #include "trace_helper.h" diff --git a/hipamd/src/macro_based_grid_launch.inl b/hipamd/src/macro_based_grid_launch.inl new file mode 100644 index 0000000000..ad5340c097 --- /dev/null +++ b/hipamd/src/macro_based_grid_launch.inl @@ -0,0 +1,99 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "hip/hcc_detail/grid_launch_GGL.hpp" + +// Internal header, do not percolate upwards. +#include "hip_hcc_internal.h" +#include "hc.hpp" +#include "trace_helper.h" + +#include +#include + +namespace hip_impl +{ + hc::accelerator_view lock_stream_hip_( + hipStream_t& stream, void*& locked_stream) + { // This allocated but does not take ownership of locked_stream. If it is + // not deleted elsewhere it will leak. + using L = decltype(stream->lockopen_preKernelCommand()); + + HIP_INIT(); + + stream = ihipSyncAndResolveStream(stream); + locked_stream = new L{stream->lockopen_preKernelCommand()}; + return (*static_cast(locked_stream))->_av; + } + + void print_prelaunch_trace_( + const char* kernel_name, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream) + { + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || + HIP_PROFILE_API || + (COMPILE_HIP_DB && (HIP_TRACE_API & (1<lockopen_preKernelCommand()); + + stream->lockclose_postKernelCommand(kernel_name, acc_v); + + delete static_cast(locked_stream); + locked_stream = nullptr; + if(HIP_PROFILE_API) { + MARKER_END(); + } + } +} \ No newline at end of file From 37c1811b2a604f48e6d21d178fcf6145417bc468 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 01:37:52 +0000 Subject: [PATCH 19/27] Fix oversight in selection mechanism which led to erroneous code to be compiled for the grid_launch_GGL component. --- hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp | 2 +- hipamd/src/functional_grid_launch.inl | 1 - hipamd/src/grid_launch.cpp | 4 +++- hipamd/src/macro_based_grid_launch.inl | 2 -- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 187d84dbff..10dae540a4 100644 --- a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
#if GENERIC_GRID_LAUNCH == 1 #if __hcc_workweek__ >= 17481 - #define FUNCTIONAL_GRID_LAUNCH + #define FUNCTIONAL_GRID_LAUNCH 1 #include "functional_grid_launch.hpp" #else #include "macro_based_grid_launch.hpp" diff --git a/hipamd/src/functional_grid_launch.inl b/hipamd/src/functional_grid_launch.inl index 4a26f66c8c..b555967ebc 100644 --- a/hipamd/src/functional_grid_launch.inl +++ b/hipamd/src/functional_grid_launch.inl @@ -20,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip/hcc_detail/grid_launch_GGL.hpp" #include "hip/hcc_detail/program_state.hpp" #include "hip/hip_runtime_api.h" diff --git a/hipamd/src/grid_launch.cpp b/hipamd/src/grid_launch.cpp index 484d314fba..d63fd2d49a 100644 --- a/hipamd/src/grid_launch.cpp +++ b/hipamd/src/grid_launch.cpp @@ -20,7 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#if defined(FUNCTIONAL_GRID_LAUNCH) +#include "hip/hcc_detail/grid_launch_GGL.hpp" + +#if !defined(FUNCTIONAL_GRID_LAUNCH) #include "functional_grid_launch.inl" #else #include "macro_based_grid_launch.inl" diff --git a/hipamd/src/macro_based_grid_launch.inl b/hipamd/src/macro_based_grid_launch.inl index ad5340c097..5547d3a71a 100644 --- a/hipamd/src/macro_based_grid_launch.inl +++ b/hipamd/src/macro_based_grid_launch.inl @@ -20,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip/hcc_detail/grid_launch_GGL.hpp" - // Internal header, do not percolate upwards. #include "hip_hcc_internal.h" #include "hc.hpp" From e305d3fc94921dc55a488dac8e1e201135bb8b2b Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 02:16:44 +0000 Subject: [PATCH 20/27] Add missing file. 
--- hipamd/src/grid_launch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/src/grid_launch.cpp b/hipamd/src/grid_launch.cpp index d63fd2d49a..8eb3f1dc75 100644 --- a/hipamd/src/grid_launch.cpp +++ b/hipamd/src/grid_launch.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. #include "hip/hcc_detail/grid_launch_GGL.hpp" -#if !defined(FUNCTIONAL_GRID_LAUNCH) +#if defined(FUNCTIONAL_GRID_LAUNCH) #include "functional_grid_launch.inl" #else #include "macro_based_grid_launch.inl" From 93e595c26720bcbf0d2db1651374971763584917 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 03:05:53 +0000 Subject: [PATCH 21/27] Fix compiler version check. --- hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp | 2 +- hipamd/src/grid_launch.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 10dae540a4..95903436b6 100644 --- a/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -19,10 +19,10 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#pragma once #if GENERIC_GRID_LAUNCH == 1 #if __hcc_workweek__ >= 17481 - #define FUNCTIONAL_GRID_LAUNCH 1 #include "functional_grid_launch.hpp" #else #include "macro_based_grid_launch.hpp" diff --git a/hipamd/src/grid_launch.cpp b/hipamd/src/grid_launch.cpp index 8eb3f1dc75..1fe47c189a 100644 --- a/hipamd/src/grid_launch.cpp +++ b/hipamd/src/grid_launch.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
#include "hip/hcc_detail/grid_launch_GGL.hpp" -#if defined(FUNCTIONAL_GRID_LAUNCH) +#if __hcc_workweek__ >= 17481 #include "functional_grid_launch.inl" #else #include "macro_based_grid_launch.inl" From 4792475d0168365403535e4f4d2960cbd2a2a4a9 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:01:28 +0000 Subject: [PATCH 22/27] Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions. --- hipamd/include/hip/hcc_detail/hip_runtime.h | 40 ++++++++++--------- hipamd/samples/0_Intro/square/square.cu | 4 +- hipamd/src/device_util.cpp | 4 +- hipamd/src/hip_memory.cpp | 4 +- .../device/hipFuncDeviceSynchronize.cpp | 2 +- hipamd/tests/src/deviceLib/hipComplex.cpp | 2 +- .../tests/src/deviceLib/hipDeviceMemcpy.cpp | 4 +- hipamd/tests/src/deviceLib/hipFloatMath.cpp | 2 +- .../src/deviceLib/hipSimpleAtomicsTest.cpp | 2 +- hipamd/tests/src/deviceLib/hipTestDevice.cpp | 32 +++++++-------- .../src/deviceLib/hipTestDeviceDouble.cpp | 28 ++++++------- .../src/deviceLib/hipTestDeviceSymbol.cpp | 2 +- hipamd/tests/src/deviceLib/hipTestHalf.cpp | 4 +- hipamd/tests/src/deviceLib/hipThreadFence.cpp | 2 +- hipamd/tests/src/deviceLib/hip_anyall.cpp | 6 +-- hipamd/tests/src/deviceLib/hip_ballot.cpp | 8 ++-- hipamd/tests/src/deviceLib/hip_brev.cpp | 4 +- hipamd/tests/src/deviceLib/hip_clz.cpp | 4 +- hipamd/tests/src/deviceLib/hip_ffs.cpp | 4 +- hipamd/tests/src/deviceLib/hip_popc.cpp | 4 +- hipamd/tests/src/deviceLib/hip_test_ldg.cpp | 4 +- .../src/deviceLib/hip_test_make_type.cpp | 20 +++++----- hipamd/tests/src/deviceLib/hip_trig.cpp | 2 +- hipamd/tests/src/experimental/xcompile/hHip.c | 2 +- .../src/experimental/xcompile/hipxxKer.cpp | 2 +- .../src/experimental/xcompile/hxxHip.cpp | 2 +- hipamd/tests/src/hipC.c | 2 +- hipamd/tests/src/hipC.cpp | 2 +- hipamd/tests/src/hipCKernel.c | 2 +- 
hipamd/tests/src/kernel/hipDynamicShared.cpp | 4 +- hipamd/tests/src/kernel/hipDynamicShared2.cpp | 2 +- hipamd/tests/src/kernel/hipGridLaunch.cpp | 4 +- .../src/kernel/hipLanguageExtensions.cpp | 8 ++-- hipamd/tests/src/kernel/hipTestConstant.cpp | 2 +- .../tests/src/kernel/hipTestMallocKernel.cpp | 4 +- hipamd/tests/src/kernel/hipTestMemKernel.cpp | 20 +++++----- hipamd/tests/src/kernel/inline_asm_vadd.cpp | 2 +- hipamd/tests/src/kernel/inline_asm_vmac.cpp | 2 +- hipamd/tests/src/kernel/launch_bounds.cpp | 2 +- .../device/hipDeviceSynchronize.cpp | 2 +- .../src/runtimeApi/memory/hipHostGetFlags.cpp | 2 +- .../src/runtimeApi/memory/hipHostMalloc.cpp | 4 +- .../src/runtimeApi/memory/hipHostRegister.cpp | 2 +- .../src/runtimeApi/memory/hipMemcpyAsync.cpp | 4 +- .../memory/hipMemoryAllocateCoherent.cpp | 2 +- .../runtimeApi/memory/p2p_copy_coherency.cpp | 8 ++-- .../tests/src/runtimeApi/module/hipModule.cpp | 2 +- .../src/runtimeApi/module/vcpy_kernel.cpp | 2 +- .../multiThread/hipMultiThreadStreams2.cpp | 2 +- .../runtimeApi/stream/hipAPIStreamDisable.cpp | 4 +- .../runtimeApi/stream/hipAPIStreamEnable.cpp | 4 +- .../src/runtimeApi/stream/hipNullStream.cpp | 4 +- .../tests/src/runtimeApi/stream/hipStream.h | 2 +- .../synchronization/copy_coherency.cpp | 8 ++-- .../synchronization/memcpyInt.device.cpp | 4 +- hipamd/tests/src/specialFunc.cu | 2 +- hipamd/tests/src/stress/hipStressAsync.cpp | 2 +- hipamd/tests/src/stress/hipStressChain.cpp | 2 +- hipamd/tests/src/stress/hipStressKernel.cpp | 2 +- hipamd/tests/src/stress/hipStressSync.cpp | 2 +- hipamd/tests/src/test_common.h | 20 +++++----- hipamd/tests/src/texture/hipTextureObj2D.cpp | 4 +- hipamd/tests/src/texture/hipTextureRef2D.cpp | 4 +- 63 files changed, 173 insertions(+), 171 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index 924e774af0..944f74864b 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ 
b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -381,27 +381,29 @@ __device__ void __threadfence_system(void) ; * @} */ -template::type f> -class Coordinates { - using R = decltype(f(0)); +#if __hcc_workweek__ >= 17481 + template::type f> + class Coordinates { + using R = decltype(f(0)); - struct X { __device__ operator R() const { return f(0); } }; - struct Y { __device__ operator R() const { return f(1); } }; - struct Z { __device__ operator R() const { return f(2); } }; -public: - static constexpr X x{}; - static constexpr Y y{}; - static constexpr Z z{}; -}; + struct X { __device__ operator R() const { return f(0); } }; + struct Y { __device__ operator R() const { return f(1); } }; + struct Z { __device__ operator R() const { return f(2); } }; + public: + static constexpr X x{}; + static constexpr Y y{}; + static constexpr Z z{}; + }; -static constexpr Coordinates blockDim; -static constexpr Coordinates blockIdx; -static constexpr Coordinates gridDim; -static constexpr Coordinates threadIdx; + static constexpr Coordinates blockDim; + static constexpr Coordinates blockIdx; + static constexpr Coordinates gridDim; + static constexpr Coordinates threadIdx; +#endif #define hipThreadIdx_x (hc_get_workitem_id(0)) #define hipThreadIdx_y (hc_get_workitem_id(1)) diff --git a/hipamd/samples/0_Intro/square/square.cu b/hipamd/samples/0_Intro/square/square.cu index 82b31db14a..ccaa9ae0bc 100644 --- a/hipamd/samples/0_Intro/square/square.cu +++ b/hipamd/samples/0_Intro/square/square.cu @@ -40,8 +40,8 @@ template __global__ void vector_square(T *C_d, const T *A_d, size_t N) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i(&f[idx]), diff --git a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index 
c8c2e644c3..dac56bf709 100644 --- a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; // Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i>pshift] = __any(tid -77); - device_all[threadIdx.x>>pshift] = __all(tid -77); + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + device_any[hipThreadIdx_x>>pshift] = __any(tid -77); + device_all[hipThreadIdx_x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) diff --git a/hipamd/tests/src/deviceLib/hip_ballot.cpp b/hipamd/tests/src/deviceLib/hip_ballot.cpp index 14b8f314a1..742c47a065 100644 --- a/hipamd/tests/src/deviceLib/hip_ballot.cpp +++ b/hipamd/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const unsigned int warp_num = threadIdx.x >> pshift; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const unsigned int warp_num = hipThreadIdx_x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/hipamd/tests/src/deviceLib/hip_brev.cpp b/hipamd/tests/src/deviceLib/hip_brev.cpp index c08c39dec9..855a8bec47 100644 --- 
a/hipamd/tests/src/deviceLib/hip_brev.cpp +++ b/hipamd/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_clz.cpp b/hipamd/tests/src/deviceLib/hip_clz.cpp index 53fd611184..bdb31f3e8d 100644 --- a/hipamd/tests/src/deviceLib/hip_clz.cpp +++ b/hipamd/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_ffs.cpp b/hipamd/tests/src/deviceLib/hip_ffs.cpp index 49530bb298..c855ede060 100644 --- a/hipamd/tests/src/deviceLib/hip_ffs.cpp +++ b/hipamd/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_popc.cpp b/hipamd/tests/src/deviceLib/hip_popc.cpp index 19dafb4d43..e503e55b42 100644 --- a/hipamd/tests/src/deviceLib/hip_popc.cpp +++ b/hipamd/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned 
int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp index 4db522cc10..5540c4917d 100644 --- a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp index 6eba236e12..ce689ceb89 100644 --- a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp @@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp, char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp, char2* a, const char2* bm, const char2* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -73,8 +73,8 @@ 
vectoradd_char3(hipLaunchParm lp, char3* a, const char3* bm, const char3* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp, char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp, __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - int x = blockDimX * blockIdx.x + threadIdx.x; - int y = blockDimY * blockIdy.y + threadIdx.y; + int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x; + int y = blockDimY * blockIdy.y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_trig.cpp b/hipamd/tests/src/deviceLib/hip_trig.cpp index 6ee8dc58ad..5ec28101f3 100644 --- a/hipamd/tests/src/deviceLib/hip_trig.cpp +++ b/hipamd/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. 
#define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/hipamd/tests/src/experimental/xcompile/hHip.c b/hipamd/tests/src/experimental/xcompile/hHip.c index 17e7e9ecf6..2ac4ebc73e 100644 --- a/hipamd/tests/src/experimental/xcompile/hHip.c +++ b/hipamd/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp index 5dca6c1bca..d1bbed63cd 100644 --- a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. 
__global__ void Kern(hipLaunchParm lp, float *A) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; A[tx] += 1.0f; } diff --git a/hipamd/tests/src/experimental/xcompile/hxxHip.cpp b/hipamd/tests/src/experimental/xcompile/hxxHip.cpp index bca5d64afc..6a748d5c89 100644 --- a/hipamd/tests/src/experimental/xcompile/hxxHip.cpp +++ b/hipamd/tests/src/experimental/xcompile/hxxHip.cpp @@ -33,7 +33,7 @@ class memManager; template __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/hipC.c b/hipamd/tests/src/hipC.c index efa03bb909..644df6c98f 100644 --- a/hipamd/tests/src/hipC.c +++ b/hipamd/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. #define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); - size_t tid = threadIdx.x; + size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t tid = hipThreadIdx_x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/hipamd/tests/src/kernel/hipDynamicShared2.cpp b/hipamd/tests/src/kernel/hipDynamicShared2.cpp index 4567ff6fc2..95e70a9956 100644 --- a/hipamd/tests/src/kernel/hipDynamicShared2.cpp +++ b/hipamd/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. 
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = threadIdx.x; + int tx = hipThreadIdx_x; for(int i=0;i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Ad[tx] = Ad[tx] + float(1); } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index 5cd46c808a..c4f4b23dc0 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -70,8 +70,8 @@ template __global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Array[tx] = Array[tx] + T(1); } diff --git a/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp index 66b93a164f..4e343121ed 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp @@ -29,7 +29,7 @@ THE SOFTWARE. 
const int NN = 1 << 21; __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){ - int tid = threadIdx.x; + int tid = hipThreadIdx_x; if(tid < 1){ for(int i=0;i __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; In[tx] = In[tx] + 1; } diff --git a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp index b2a66f61e2..e4bfb98206 100644 --- a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp +++ b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); __global__ void memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } @@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements) __global__ void memcpyIntKernel(int *dst, const int * src, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp b/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp index 2916d51bf9..b34d331682 100644 --- a/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp +++ b/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp @@ -5,8 +5,8 @@ extern "C" __global__ void memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) { - int gid = 
(blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/hipamd/tests/src/specialFunc.cu b/hipamd/tests/src/specialFunc.cu index 085be062d9..744dcd8926 100644 --- a/hipamd/tests/src/specialFunc.cu +++ b/hipamd/tests/src/specialFunc.cu @@ -23,7 +23,7 @@ THE SOFTWARE. void __global__ test_kernel(float *A) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; float a = __ballot(tid < 16); float b = __shfl(tid < 16); diff --git a/hipamd/tests/src/stress/hipStressAsync.cpp b/hipamd/tests/src/stress/hipStressAsync.cpp index a142b41730..1f8cab1a36 100644 --- a/hipamd/tests/src/stress/hipStressAsync.cpp +++ b/hipamd/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i = 0; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -169,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; diff --git a/hipamd/tests/src/texture/hipTextureObj2D.cpp b/hipamd/tests/src/texture/hipTextureObj2D.cpp index 9ddafd6b1c..443d708418 100644 --- a/hipamd/tests/src/texture/hipTextureObj2D.cpp +++ b/hipamd/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + 
threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/hipamd/tests/src/texture/hipTextureRef2D.cpp b/hipamd/tests/src/texture/hipTextureRef2D.cpp index c42f09d5a0..ebc7a04385 100644 --- a/hipamd/tests/src/texture/hipTextureRef2D.cpp +++ b/hipamd/tests/src/texture/hipTextureRef2D.cpp @@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else From 2557000b5657d20d87f2b4ceac14597e2016443e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:36:29 +0000 Subject: [PATCH 23/27] Revert "Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions." 
This reverts commit d2fd1f5 --- hipamd/samples/0_Intro/square/square.cu | 4 +-- hipamd/src/device_util.cpp | 4 +-- hipamd/src/hip_memory.cpp | 4 +-- .../device/hipFuncDeviceSynchronize.cpp | 2 +- hipamd/tests/src/deviceLib/hipComplex.cpp | 2 +- .../tests/src/deviceLib/hipDeviceMemcpy.cpp | 4 +-- hipamd/tests/src/deviceLib/hipFloatMath.cpp | 2 +- .../src/deviceLib/hipSimpleAtomicsTest.cpp | 2 +- hipamd/tests/src/deviceLib/hipTestDevice.cpp | 32 +++++++++---------- .../src/deviceLib/hipTestDeviceDouble.cpp | 28 ++++++++-------- .../src/deviceLib/hipTestDeviceSymbol.cpp | 2 +- hipamd/tests/src/deviceLib/hipTestHalf.cpp | 4 +-- hipamd/tests/src/deviceLib/hipThreadFence.cpp | 2 +- hipamd/tests/src/deviceLib/hip_anyall.cpp | 6 ++-- hipamd/tests/src/deviceLib/hip_ballot.cpp | 8 ++--- hipamd/tests/src/deviceLib/hip_brev.cpp | 4 +-- hipamd/tests/src/deviceLib/hip_clz.cpp | 4 +-- hipamd/tests/src/deviceLib/hip_ffs.cpp | 4 +-- hipamd/tests/src/deviceLib/hip_popc.cpp | 4 +-- hipamd/tests/src/deviceLib/hip_test_ldg.cpp | 4 +-- .../src/deviceLib/hip_test_make_type.cpp | 20 ++++++------ hipamd/tests/src/deviceLib/hip_trig.cpp | 2 +- hipamd/tests/src/experimental/xcompile/hHip.c | 2 +- .../src/experimental/xcompile/hipxxKer.cpp | 2 +- .../src/experimental/xcompile/hxxHip.cpp | 2 +- hipamd/tests/src/hipC.c | 2 +- hipamd/tests/src/hipC.cpp | 2 +- hipamd/tests/src/hipCKernel.c | 2 +- hipamd/tests/src/kernel/hipDynamicShared.cpp | 4 +-- hipamd/tests/src/kernel/hipDynamicShared2.cpp | 2 +- hipamd/tests/src/kernel/hipGridLaunch.cpp | 4 +-- .../src/kernel/hipLanguageExtensions.cpp | 8 ++--- hipamd/tests/src/kernel/hipTestConstant.cpp | 2 +- .../tests/src/kernel/hipTestMallocKernel.cpp | 4 +-- hipamd/tests/src/kernel/hipTestMemKernel.cpp | 20 ++++++------ hipamd/tests/src/kernel/inline_asm_vadd.cpp | 2 +- hipamd/tests/src/kernel/inline_asm_vmac.cpp | 2 +- hipamd/tests/src/kernel/launch_bounds.cpp | 2 +- .../device/hipDeviceSynchronize.cpp | 2 +- 
.../src/runtimeApi/memory/hipHostGetFlags.cpp | 2 +- .../src/runtimeApi/memory/hipHostMalloc.cpp | 4 +-- .../src/runtimeApi/memory/hipHostRegister.cpp | 2 +- .../src/runtimeApi/memory/hipMemcpyAsync.cpp | 4 +-- .../memory/hipMemoryAllocateCoherent.cpp | 2 +- .../runtimeApi/memory/p2p_copy_coherency.cpp | 8 ++--- .../tests/src/runtimeApi/module/hipModule.cpp | 2 +- .../src/runtimeApi/module/vcpy_kernel.cpp | 2 +- .../multiThread/hipMultiThreadStreams2.cpp | 2 +- .../runtimeApi/stream/hipAPIStreamDisable.cpp | 4 +-- .../runtimeApi/stream/hipAPIStreamEnable.cpp | 4 +-- .../src/runtimeApi/stream/hipNullStream.cpp | 4 +-- .../tests/src/runtimeApi/stream/hipStream.h | 2 +- .../synchronization/copy_coherency.cpp | 8 ++--- .../synchronization/memcpyInt.device.cpp | 4 +-- hipamd/tests/src/specialFunc.cu | 2 +- hipamd/tests/src/stress/hipStressAsync.cpp | 2 +- hipamd/tests/src/stress/hipStressChain.cpp | 2 +- hipamd/tests/src/stress/hipStressKernel.cpp | 2 +- hipamd/tests/src/stress/hipStressSync.cpp | 2 +- hipamd/tests/src/test_common.h | 20 ++++++------ hipamd/tests/src/texture/hipTextureObj2D.cpp | 4 +-- hipamd/tests/src/texture/hipTextureRef2D.cpp | 4 +-- 62 files changed, 152 insertions(+), 152 deletions(-) diff --git a/hipamd/samples/0_Intro/square/square.cu b/hipamd/samples/0_Intro/square/square.cu index 82b31db14a..ccaa9ae0bc 100644 --- a/hipamd/samples/0_Intro/square/square.cu +++ b/hipamd/samples/0_Intro/square/square.cu @@ -40,8 +40,8 @@ template __global__ void vector_square(T *C_d, const T *A_d, size_t N) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i(&f[idx]), diff --git a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index c8c2e644c3..dac56bf709 100644 --- 
a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; // Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i>pshift] = __any(tid -77); - device_all[threadIdx.x>>pshift] = __all(tid -77); + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + device_any[hipThreadIdx_x>>pshift] = __any(tid -77); + device_all[hipThreadIdx_x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) diff --git a/hipamd/tests/src/deviceLib/hip_ballot.cpp b/hipamd/tests/src/deviceLib/hip_ballot.cpp index 14b8f314a1..742c47a065 100644 --- a/hipamd/tests/src/deviceLib/hip_ballot.cpp +++ b/hipamd/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const unsigned int warp_num = threadIdx.x >> pshift; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const unsigned int warp_num = hipThreadIdx_x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/hipamd/tests/src/deviceLib/hip_brev.cpp b/hipamd/tests/src/deviceLib/hip_brev.cpp index c08c39dec9..855a8bec47 100644 --- a/hipamd/tests/src/deviceLib/hip_brev.cpp +++ 
b/hipamd/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_clz.cpp b/hipamd/tests/src/deviceLib/hip_clz.cpp index 53fd611184..bdb31f3e8d 100644 --- a/hipamd/tests/src/deviceLib/hip_clz.cpp +++ b/hipamd/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_ffs.cpp b/hipamd/tests/src/deviceLib/hip_ffs.cpp index 49530bb298..c855ede060 100644 --- a/hipamd/tests/src/deviceLib/hip_ffs.cpp +++ b/hipamd/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_popc.cpp b/hipamd/tests/src/deviceLib/hip_popc.cpp index 19dafb4d43..e503e55b42 100644 --- a/hipamd/tests/src/deviceLib/hip_popc.cpp +++ b/hipamd/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long 
int* d, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp index 4db522cc10..5540c4917d 100644 --- a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp index 6eba236e12..ce689ceb89 100644 --- a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp @@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp, char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp, char2* a, const char2* bm, const char2* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp, char3* a, 
const char3* bm, const char3* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp, char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = blockDim.x * blockIdx.x + threadIdx.x; - int y = blockDim.y * blockIdx.y + threadIdx.y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { @@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp, __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - int x = blockDimX * blockIdx.x + threadIdx.x; - int y = blockDimY * blockIdy.y + threadIdx.y; + int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x; + int y = blockDimY * blockIdy.y + hipThreadIdx_y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_trig.cpp b/hipamd/tests/src/deviceLib/hip_trig.cpp index 6ee8dc58ad..5ec28101f3 100644 --- a/hipamd/tests/src/deviceLib/hip_trig.cpp +++ b/hipamd/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. 
#define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/hipamd/tests/src/experimental/xcompile/hHip.c b/hipamd/tests/src/experimental/xcompile/hHip.c index 17e7e9ecf6..2ac4ebc73e 100644 --- a/hipamd/tests/src/experimental/xcompile/hHip.c +++ b/hipamd/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp index 5dca6c1bca..d1bbed63cd 100644 --- a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. 
__global__ void Kern(hipLaunchParm lp, float *A) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; A[tx] += 1.0f; } diff --git a/hipamd/tests/src/experimental/xcompile/hxxHip.cpp b/hipamd/tests/src/experimental/xcompile/hxxHip.cpp index bca5d64afc..6a748d5c89 100644 --- a/hipamd/tests/src/experimental/xcompile/hxxHip.cpp +++ b/hipamd/tests/src/experimental/xcompile/hxxHip.cpp @@ -33,7 +33,7 @@ class memManager; template __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/hipC.c b/hipamd/tests/src/hipC.c index efa03bb909..644df6c98f 100644 --- a/hipamd/tests/src/hipC.c +++ b/hipamd/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. #define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); - size_t tid = threadIdx.x; + size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t tid = hipThreadIdx_x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/hipamd/tests/src/kernel/hipDynamicShared2.cpp b/hipamd/tests/src/kernel/hipDynamicShared2.cpp index 4567ff6fc2..95e70a9956 100644 --- a/hipamd/tests/src/kernel/hipDynamicShared2.cpp +++ b/hipamd/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. 
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = threadIdx.x; + int tx = hipThreadIdx_x; for(int i=0;i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Ad[tx] = Ad[tx] + float(1); } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index 5cd46c808a..c4f4b23dc0 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -70,8 +70,8 @@ template __global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Array[tx] = Array[tx] + T(1); } diff --git a/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp index 66b93a164f..4e343121ed 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp @@ -29,7 +29,7 @@ THE SOFTWARE. 
const int NN = 1 << 21; __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){ - int tid = threadIdx.x; + int tid = hipThreadIdx_x; if(tid < 1){ for(int i=0;i __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = threadIdx.x + blockIdx.x * blockDim.x; +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; In[tx] = In[tx] + 1; } diff --git a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp index b2a66f61e2..e4bfb98206 100644 --- a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp +++ b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); __global__ void memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } @@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements) __global__ void memcpyIntKernel(int *dst, const int * src, size_t numElements) { - int gid = (blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp b/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp index 2916d51bf9..b34d331682 100644 --- a/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp +++ b/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp @@ -5,8 +5,8 @@ extern "C" __global__ void memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) { - int gid = 
(blockIdx.x * blockDim.x + threadIdx.x); - int stride = blockDim.x * gridDim.x ; + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/hipamd/tests/src/specialFunc.cu b/hipamd/tests/src/specialFunc.cu index 085be062d9..744dcd8926 100644 --- a/hipamd/tests/src/specialFunc.cu +++ b/hipamd/tests/src/specialFunc.cu @@ -23,7 +23,7 @@ THE SOFTWARE. void __global__ test_kernel(float *A) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; float a = __ballot(tid < 16); float b = __shfl(tid < 16); diff --git a/hipamd/tests/src/stress/hipStressAsync.cpp b/hipamd/tests/src/stress/hipStressAsync.cpp index a142b41730..1f8cab1a36 100644 --- a/hipamd/tests/src/stress/hipStressAsync.cpp +++ b/hipamd/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if(tx == 0){ for(int i = 0; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -169,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; diff --git a/hipamd/tests/src/texture/hipTextureObj2D.cpp b/hipamd/tests/src/texture/hipTextureObj2D.cpp index 9ddafd6b1c..443d708418 100644 --- a/hipamd/tests/src/texture/hipTextureObj2D.cpp +++ b/hipamd/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + 
threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/hipamd/tests/src/texture/hipTextureRef2D.cpp b/hipamd/tests/src/texture/hipTextureRef2D.cpp index c42f09d5a0..ebc7a04385 100644 --- a/hipamd/tests/src/texture/hipTextureRef2D.cpp +++ b/hipamd/tests/src/texture/hipTextureRef2D.cpp @@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = blockIdx.x*blockDim.x + threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; + int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else From 92e80f0943b12b27ce87e4839debf0ebf82dc009 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:47:04 +0000 Subject: [PATCH 24/27] Use a much simpler guard for version 1.6, which allows for direct CUDA indexing to be used. --- hipamd/include/hip/hcc_detail/host_defines.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/host_defines.h b/hipamd/include/hip/hcc_detail/host_defines.h index 56cfa0cc0f..d600956087 100644 --- a/hipamd/include/hip/hcc_detail/host_defines.h +++ b/hipamd/include/hip/hcc_detail/host_defines.h @@ -44,8 +44,12 @@ THE SOFTWARE. 
#if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -#define __global__ \ - __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) + #if __hcc_workweek__ >=17481 + #define __global__ \ + __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) + #else + #define __global__ __attribute__((hc, used)) + #endif #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From 4966518846af643083bffc1d973e7939f7dafa11 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:49:10 +0000 Subject: [PATCH 25/27] Revert "Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions." This reverts commit 4792475 --- hipamd/include/hip/hcc_detail/hip_runtime.h | 40 +++++++++---------- hipamd/samples/0_Intro/square/square.cu | 4 +- hipamd/src/device_util.cpp | 4 +- hipamd/src/hip_memory.cpp | 4 +- .../device/hipFuncDeviceSynchronize.cpp | 2 +- hipamd/tests/src/deviceLib/hipComplex.cpp | 2 +- .../tests/src/deviceLib/hipDeviceMemcpy.cpp | 4 +- hipamd/tests/src/deviceLib/hipFloatMath.cpp | 2 +- .../src/deviceLib/hipSimpleAtomicsTest.cpp | 2 +- hipamd/tests/src/deviceLib/hipTestDevice.cpp | 32 +++++++-------- .../src/deviceLib/hipTestDeviceDouble.cpp | 28 ++++++------- .../src/deviceLib/hipTestDeviceSymbol.cpp | 2 +- hipamd/tests/src/deviceLib/hipTestHalf.cpp | 4 +- hipamd/tests/src/deviceLib/hipThreadFence.cpp | 2 +- hipamd/tests/src/deviceLib/hip_anyall.cpp | 6 +-- hipamd/tests/src/deviceLib/hip_ballot.cpp | 8 ++-- hipamd/tests/src/deviceLib/hip_brev.cpp | 4 +- hipamd/tests/src/deviceLib/hip_clz.cpp | 4 +- hipamd/tests/src/deviceLib/hip_ffs.cpp | 4 +- hipamd/tests/src/deviceLib/hip_popc.cpp | 4 +- hipamd/tests/src/deviceLib/hip_test_ldg.cpp | 4 +- .../src/deviceLib/hip_test_make_type.cpp | 20 +++++----- 
hipamd/tests/src/deviceLib/hip_trig.cpp | 2 +- hipamd/tests/src/experimental/xcompile/hHip.c | 2 +- .../src/experimental/xcompile/hipxxKer.cpp | 2 +- .../src/experimental/xcompile/hxxHip.cpp | 2 +- hipamd/tests/src/hipC.c | 2 +- hipamd/tests/src/hipC.cpp | 2 +- hipamd/tests/src/hipCKernel.c | 2 +- hipamd/tests/src/kernel/hipDynamicShared.cpp | 4 +- hipamd/tests/src/kernel/hipDynamicShared2.cpp | 2 +- hipamd/tests/src/kernel/hipGridLaunch.cpp | 4 +- .../src/kernel/hipLanguageExtensions.cpp | 8 ++-- hipamd/tests/src/kernel/hipTestConstant.cpp | 2 +- .../tests/src/kernel/hipTestMallocKernel.cpp | 4 +- hipamd/tests/src/kernel/hipTestMemKernel.cpp | 20 +++++----- hipamd/tests/src/kernel/inline_asm_vadd.cpp | 2 +- hipamd/tests/src/kernel/inline_asm_vmac.cpp | 2 +- hipamd/tests/src/kernel/launch_bounds.cpp | 2 +- .../device/hipDeviceSynchronize.cpp | 2 +- .../src/runtimeApi/memory/hipHostGetFlags.cpp | 2 +- .../src/runtimeApi/memory/hipHostMalloc.cpp | 4 +- .../src/runtimeApi/memory/hipHostRegister.cpp | 2 +- .../src/runtimeApi/memory/hipMemcpyAsync.cpp | 4 +- .../memory/hipMemoryAllocateCoherent.cpp | 2 +- .../runtimeApi/memory/p2p_copy_coherency.cpp | 8 ++-- .../tests/src/runtimeApi/module/hipModule.cpp | 2 +- .../src/runtimeApi/module/vcpy_kernel.cpp | 2 +- .../multiThread/hipMultiThreadStreams2.cpp | 2 +- .../runtimeApi/stream/hipAPIStreamDisable.cpp | 4 +- .../runtimeApi/stream/hipAPIStreamEnable.cpp | 4 +- .../src/runtimeApi/stream/hipNullStream.cpp | 4 +- .../tests/src/runtimeApi/stream/hipStream.h | 2 +- .../synchronization/copy_coherency.cpp | 8 ++-- .../synchronization/memcpyInt.device.cpp | 4 +- hipamd/tests/src/specialFunc.cu | 2 +- hipamd/tests/src/stress/hipStressAsync.cpp | 2 +- hipamd/tests/src/stress/hipStressChain.cpp | 2 +- hipamd/tests/src/stress/hipStressKernel.cpp | 2 +- hipamd/tests/src/stress/hipStressSync.cpp | 2 +- hipamd/tests/src/test_common.h | 20 +++++----- hipamd/tests/src/texture/hipTextureObj2D.cpp | 4 +- 
hipamd/tests/src/texture/hipTextureRef2D.cpp | 4 +- 63 files changed, 171 insertions(+), 173 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index 944f74864b..924e774af0 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -381,29 +381,27 @@ __device__ void __threadfence_system(void) ; * @} */ -#if __hcc_workweek__ >= 17481 - template::type f> - class Coordinates { - using R = decltype(f(0)); +template::type f> +class Coordinates { + using R = decltype(f(0)); - struct X { __device__ operator R() const { return f(0); } }; - struct Y { __device__ operator R() const { return f(1); } }; - struct Z { __device__ operator R() const { return f(2); } }; - public: - static constexpr X x{}; - static constexpr Y y{}; - static constexpr Z z{}; - }; + struct X { __device__ operator R() const { return f(0); } }; + struct Y { __device__ operator R() const { return f(1); } }; + struct Z { __device__ operator R() const { return f(2); } }; +public: + static constexpr X x{}; + static constexpr Y y{}; + static constexpr Z z{}; +}; - static constexpr Coordinates blockDim; - static constexpr Coordinates blockIdx; - static constexpr Coordinates gridDim; - static constexpr Coordinates threadIdx; -#endif +static constexpr Coordinates blockDim; +static constexpr Coordinates blockIdx; +static constexpr Coordinates gridDim; +static constexpr Coordinates threadIdx; #define hipThreadIdx_x (hc_get_workitem_id(0)) #define hipThreadIdx_y (hc_get_workitem_id(1)) diff --git a/hipamd/samples/0_Intro/square/square.cu b/hipamd/samples/0_Intro/square/square.cu index ccaa9ae0bc..82b31db14a 100644 --- a/hipamd/samples/0_Intro/square/square.cu +++ b/hipamd/samples/0_Intro/square/square.cu @@ -40,8 +40,8 @@ template __global__ void vector_square(T *C_d, const T *A_d, size_t N) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * 
hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i(&f[idx]), diff --git a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index dac56bf709..c8c2e644c3 100644 --- a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; // Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i>pshift] = __any(tid -77); - device_all[hipThreadIdx_x>>pshift] = __all(tid -77); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + device_any[threadIdx.x>>pshift] = __any(tid -77); + device_all[threadIdx.x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) diff --git a/hipamd/tests/src/deviceLib/hip_ballot.cpp b/hipamd/tests/src/deviceLib/hip_ballot.cpp index 742c47a065..14b8f314a1 100644 --- a/hipamd/tests/src/deviceLib/hip_ballot.cpp +++ b/hipamd/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const unsigned int warp_num = hipThreadIdx_x >> pshift; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const unsigned int warp_num = threadIdx.x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - 
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/hipamd/tests/src/deviceLib/hip_brev.cpp b/hipamd/tests/src/deviceLib/hip_brev.cpp index 855a8bec47..c08c39dec9 100644 --- a/hipamd/tests/src/deviceLib/hip_brev.cpp +++ b/hipamd/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_clz.cpp b/hipamd/tests/src/deviceLib/hip_clz.cpp index bdb31f3e8d..53fd611184 100644 --- a/hipamd/tests/src/deviceLib/hip_clz.cpp +++ b/hipamd/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_ffs.cpp b/hipamd/tests/src/deviceLib/hip_ffs.cpp index c855ede060..49530bb298 100644 --- a/hipamd/tests/src/deviceLib/hip_ffs.cpp +++ b/hipamd/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i 
= y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_popc.cpp b/hipamd/tests/src/deviceLib/hip_popc.cpp index e503e55b42..19dafb4d43 100644 --- a/hipamd/tests/src/deviceLib/hip_popc.cpp +++ b/hipamd/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp index 5540c4917d..4db522cc10 100644 --- a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp index ce689ceb89..6eba236e12 100644 --- a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp @@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp, char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp, char2* a, 
const char2* bm, const char2* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp, char3* a, const char3* bm, const char3* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp, char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp, __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x; - int y = blockDimY * blockIdy.y + hipThreadIdx_y; + int x = blockDimX * blockIdx.x + threadIdx.x; + int y = blockDimY * blockIdy.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_trig.cpp b/hipamd/tests/src/deviceLib/hip_trig.cpp index 5ec28101f3..6ee8dc58ad 100644 --- a/hipamd/tests/src/deviceLib/hip_trig.cpp +++ b/hipamd/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. 
#define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/hipamd/tests/src/experimental/xcompile/hHip.c b/hipamd/tests/src/experimental/xcompile/hHip.c index 2ac4ebc73e..17e7e9ecf6 100644 --- a/hipamd/tests/src/experimental/xcompile/hHip.c +++ b/hipamd/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp index d1bbed63cd..5dca6c1bca 100644 --- a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. 
__global__ void Kern(hipLaunchParm lp, float *A) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; A[tx] += 1.0f; } diff --git a/hipamd/tests/src/experimental/xcompile/hxxHip.cpp b/hipamd/tests/src/experimental/xcompile/hxxHip.cpp index 6a748d5c89..bca5d64afc 100644 --- a/hipamd/tests/src/experimental/xcompile/hxxHip.cpp +++ b/hipamd/tests/src/experimental/xcompile/hxxHip.cpp @@ -33,7 +33,7 @@ class memManager; template __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/hipC.c b/hipamd/tests/src/hipC.c index 644df6c98f..efa03bb909 100644 --- a/hipamd/tests/src/hipC.c +++ b/hipamd/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. #define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t tid = hipThreadIdx_x; + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t tid = threadIdx.x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/hipamd/tests/src/kernel/hipDynamicShared2.cpp b/hipamd/tests/src/kernel/hipDynamicShared2.cpp index 95e70a9956..4567ff6fc2 100644 --- a/hipamd/tests/src/kernel/hipDynamicShared2.cpp +++ b/hipamd/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. 
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = hipThreadIdx_x; + int tx = threadIdx.x; for(int i=0;i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; Ad[tx] = Ad[tx] + float(1); } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index c4f4b23dc0..5cd46c808a 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -70,8 +70,8 @@ template __global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; Array[tx] = Array[tx] + T(1); } diff --git a/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp index 4e343121ed..66b93a164f 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp @@ -29,7 +29,7 @@ THE SOFTWARE. 
const int NN = 1 << 21; __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){ - int tid = hipThreadIdx_x; + int tid = threadIdx.x; if(tid < 1){ for(int i=0;i __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; In[tx] = In[tx] + 1; } diff --git a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp index e4bfb98206..b2a66f61e2 100644 --- a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp +++ b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); __global__ void memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } @@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements) __global__ void memcpyIntKernel(int *dst, const int * src, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp b/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp index b34d331682..2916d51bf9 100644 --- a/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp +++ b/hipamd/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp @@ -5,8 +5,8 @@ extern "C" __global__ void memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) { - int gid = 
(hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/hipamd/tests/src/specialFunc.cu b/hipamd/tests/src/specialFunc.cu index 744dcd8926..085be062d9 100644 --- a/hipamd/tests/src/specialFunc.cu +++ b/hipamd/tests/src/specialFunc.cu @@ -23,7 +23,7 @@ THE SOFTWARE. void __global__ test_kernel(float *A) { - int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int tid = blockIdx.x * blockDim.x + threadIdx.x; float a = __ballot(tid < 16); float b = __shfl(tid < 16); diff --git a/hipamd/tests/src/stress/hipStressAsync.cpp b/hipamd/tests/src/stress/hipStressAsync.cpp index 1f8cab1a36..a142b41730 100644 --- a/hipamd/tests/src/stress/hipStressAsync.cpp +++ b/hipamd/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i = 0; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -169,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; diff --git a/hipamd/tests/src/texture/hipTextureObj2D.cpp b/hipamd/tests/src/texture/hipTextureObj2D.cpp index 443d708418..9ddafd6b1c 100644 --- a/hipamd/tests/src/texture/hipTextureObj2D.cpp +++ b/hipamd/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + 
hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + threadIdx.y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/hipamd/tests/src/texture/hipTextureRef2D.cpp b/hipamd/tests/src/texture/hipTextureRef2D.cpp index ebc7a04385..c42f09d5a0 100644 --- a/hipamd/tests/src/texture/hipTextureRef2D.cpp +++ b/hipamd/tests/src/texture/hipTextureRef2D.cpp @@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + threadIdx.y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else From 2e395343779d1ff2ea9a4e78352bde063b45991a Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 29 Nov 2017 21:50:43 +0000 Subject: [PATCH 26/27] Add missing space (the final frontier). --- hipamd/include/hip/hcc_detail/host_defines.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/include/hip/hcc_detail/host_defines.h b/hipamd/include/hip/hcc_detail/host_defines.h index d600956087..a7acdfccf7 100644 --- a/hipamd/include/hip/hcc_detail/host_defines.h +++ b/hipamd/include/hip/hcc_detail/host_defines.h @@ -44,7 +44,7 @@ THE SOFTWARE. #if GENERIC_GRID_LAUNCH == 0 #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else - #if __hcc_workweek__ >=17481 + #if __hcc_workweek__ >= 17481 #define __global__ \ __attribute__((annotate("__HIP_global_function__"), cpu, hc, used)) #else From 33bb425013092b98d289d8298b12847fbef64ba1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 30 Nov 2017 03:29:04 +0000 Subject: [PATCH 27/27] Fix legacy mode detection of the address of an agent allocated variable. 
In this mode, there exist two executables per code object, one created by HCC and one created by HIP. Since we dispatch through HCC in legacy mode, we should obtain the address for an agent allocated variable from HCC's executable. Also add two omitted validity checks, whose absence could lead to segfaults when the current process had no .kernel section and / or when an invalid or empty blob was extracted from the latter. --- .../hip/hcc_detail/code_object_bundle.hpp | 2 + hipamd/src/hip_memory.cpp | 44 ++++++++++++------- hipamd/src/program_state.cpp | 2 +- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp index 05ba44fcc8..72f9d35c73 100644 --- a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp +++ b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp @@ -76,6 +76,8 @@ namespace hip_impl RandomAccessIterator l, Bundled_code_header& x) { + if (f == l) return false; + std::copy_n(f, sizeof(x.cbuf_), x.cbuf_); if (valid(x)) { diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index c88a1dabc1..fb25101d7e 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -808,6 +808,26 @@ hipError_t hipHostUnregister(void *hostPtr) return ihipLogStatus(hip_status); } +namespace +{ + inline + hipDeviceptr_t agent_address_for_symbol(const char* symbolName) + { + hipDeviceptr_t r = nullptr; + + #if __hcc_workweek__ >= 17481 + size_t byte_cnt = 0u; + hipModuleGetGlobal(&r, &byte_cnt, 0, symbolName); + #else + auto ctx = ihipGetTlsDefaultCtx(); + auto acc = ctx->getDevice()->_acc; + r = acc.get_symbol_address(symbolName); + #endif + + return r; + } +} + hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind); @@ -821,10 +841,8 @@ hipError_t hipMemcpyToSymbol(const void* 
symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t dst = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &dst, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t dst = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -859,10 +877,8 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t src = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &src, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t src = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -899,10 +915,8 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t dst = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &dst, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t dst = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -940,10 +954,8 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hc::accelerator acc = ctx->getDevice()->_acc; - hipDeviceptr_t src = nullptr; - size_t byte_cnt = 0u; - auto status = hipModuleGetGlobal( - &src, &byte_cnt, 0, static_cast(symbolName)); + hipDeviceptr_t src = + agent_address_for_symbol(static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src); if(src == nullptr || dst == nullptr) diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp index 61c90556be..47071d0236 100644 --- a/hipamd/src/program_state.cpp +++ b/hipamd/src/program_state.cpp @@ -288,7 +288,7 @@ namespace 
return x->get_type() == SHT_SYMTAB; }); - r = function_names_for(reader, symtab); + if (symtab) r = function_names_for(reader, symtab); }); return r;