Merge 'master' into 'amd-master'

Change-Id: I1558ce7892abab30345a426d60976af9e852351d
This commit is contained in:
Jenkins
2017-12-04 04:48:51 -06:00
37 ha cambiato i file con 3019 aggiunte e 1894 eliminazioni
+3 -1
Vedi File
@@ -167,6 +167,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}")
set(SOURCE_FILES_RUNTIME
src/code_object_bundle.cpp
src/hip_hcc.cpp
src/hip_context.cpp
src/hip_device.cpp
@@ -179,7 +180,8 @@ if(HIP_PLATFORM STREQUAL "hcc")
src/hip_db.cpp
src/grid_launch.cpp
src/hip_texture.cpp
src/env.cpp)
src/env.cpp
src/program_state.cpp)
set(SOURCE_FILES_DEVICE
src/device_util.cpp
@@ -0,0 +1,158 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <algorithm>
#include <cstdint>
#include <istream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>
namespace hip_impl
{
hsa_isa_t triple_to_hsa_isa(const std::string& triple);
struct Bundled_code {
union {
struct {
std::uint64_t offset;
std::uint64_t bundle_sz;
std::uint64_t triple_sz;
};
std::uint8_t cbuf[
sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
};
std::string triple;
std::vector<std::uint8_t> blob;
};
class Bundled_code_header {
// DATA - STATICS
static constexpr const char magic_string_[] =
"__CLANG_OFFLOAD_BUNDLE__";
static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
// DATA
union {
struct {
std::uint8_t bundler_magic_string_[magic_string_sz_];
std::uint64_t bundle_cnt_;
};
std::uint8_t cbuf_[
sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)];
};
std::vector<Bundled_code> bundles_;
// FRIENDS - MANIPULATORS
template<typename RandomAccessIterator>
friend
inline
bool read(
RandomAccessIterator f,
RandomAccessIterator l,
Bundled_code_header& x)
{
if (f == l) return false;
std::copy_n(f, sizeof(x.cbuf_), x.cbuf_);
if (valid(x)) {
x.bundles_.resize(x.bundle_cnt_);
auto it = f + sizeof(x.cbuf_);
for (auto&& y : x.bundles_) {
std::copy_n(it, sizeof(y.cbuf), y.cbuf);
it += sizeof(y.cbuf);
y.triple.insert(y.triple.cend(), it, it + y.triple_sz);
std::copy_n(
f + y.offset, y.bundle_sz, std::back_inserter(y.blob));
it += y.triple_sz;
}
return true;
}
return false;
}
friend
inline
bool read(const std::vector<std::uint8_t>& blob, Bundled_code_header& x)
{
return read(blob.cbegin(), blob.cend(), x);
}
friend
inline
bool read(std::istream& is, Bundled_code_header& x)
{
return read(std::vector<std::uint8_t>{
std::istreambuf_iterator<char>{is},
std::istreambuf_iterator<char>{}},
x);
}
// FRIENDS - ACCESSORS
friend
inline
bool valid(const Bundled_code_header& x)
{
return std::equal(
x.bundler_magic_string_,
x.bundler_magic_string_ + magic_string_sz_,
x.magic_string_);
}
friend
inline
const std::vector<Bundled_code>& bundles(const Bundled_code_header& x)
{
return x.bundles_;
}
public:
// CREATORS
Bundled_code_header() = default;
template<typename RandomAccessIterator>
Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
explicit
Bundled_code_header(const std::vector<std::uint8_t>& blob);
Bundled_code_header(const Bundled_code_header&) = default;
Bundled_code_header(Bundled_code_header&&) = default;
~Bundled_code_header() = default;
// MANIPULATORS
Bundled_code_header& operator=(const Bundled_code_header&) = default;
Bundled_code_header& operator=(Bundled_code_header&&) = default;
};
// CREATORS
template<typename I>
Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
{
read(f, l, *this);
}
} // Namespace hip_impl.
+1 -1
Vedi File
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -0,0 +1,159 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "code_object_bundle.hpp"
#include "concepts.hpp"
#include "helpers.hpp"
#include "program_state.hpp"
#include "hc.hpp"
#include "hip/hip_hcc.h"
#include "hip_runtime.h"
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
namespace hip_impl
{
template<
typename T,
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline
T round_up_to_next_multiple_nonnegative(T x, T y)
{
T tmp = x + y - 1;
return tmp - tmp % y;
}
inline
std::vector<std::uint8_t> make_kernarg()
{
return {};
}
inline
std::vector<std::uint8_t> make_kernarg(std::vector<std::uint8_t> kernarg)
{
return kernarg;
}
template<typename T>
inline
std::vector<std::uint8_t> make_kernarg(std::vector<uint8_t> kernarg, T x)
{
kernarg.resize(
round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) +
sizeof(T));
new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)};
return kernarg;
}
template<typename T, typename... Ts>
inline
std::vector<std::uint8_t> make_kernarg(
std::vector<std::uint8_t> kernarg, T x, Ts... xs)
{
return make_kernarg(
make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...);
}
template<typename... Ts>
inline
std::vector<std::uint8_t> make_kernarg(Ts... xs)
{
std::vector<std::uint8_t> kernarg;
kernarg.reserve(sizeof(std::tuple<Ts...>));
return make_kernarg(std::move(kernarg), std::move(xs)...);
}
void hipLaunchKernelGGLImpl(
std::uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg);
} // Namespace hip_impl.
template<typename... Args, typename F = void (*)(Args...)>
inline
void hipLaunchKernelGGL(
F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t sharedMemBytes,
hipStream_t stream,
Args... args)
{
auto kernarg = hip_impl::make_kernarg(std::move(args)...);
std::size_t kernarg_size = kernarg.size();
void* config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size,
HIP_LAUNCH_PARAM_END
};
hip_impl::hipLaunchKernelGGLImpl(
reinterpret_cast<std::uintptr_t>(kernel),
numBlocks,
dimBlocks,
sharedMemBytes,
stream,
&config[0]);
}
template<typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
inline
void hipLaunchKernel(
F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t groupMemBytes,
hipStream_t stream,
Args... args)
{
hipLaunchKernelGGL(
kernel,
numBlocks,
dimBlocks,
groupMemBytes,
stream,
hipLaunchParm{},
std::move(args)...);
}
+6 -983
Vedi File
@@ -19,989 +19,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if GENERIC_GRID_LAUNCH == 1
#include "concepts.hpp"
#include "helpers.hpp"
#include "hc.hpp"
#include "hip/hip_hcc.h"
#include "hip_runtime.h"
#include <functional>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <utility>
namespace hip_impl
{
namespace
{
struct New_grid_launch_tag {};
struct Old_grid_launch_tag {};
template<typename C, typename D>
class RAII_guard {
D dtor_;
public:
RAII_guard() = default;
RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)}
{
ctor();
}
RAII_guard(const RAII_guard&) = default;
RAII_guard(RAII_guard&&) = default;
RAII_guard& operator=(const RAII_guard&) = default;
RAII_guard& operator=(RAII_guard&&) = default;
~RAII_guard() { dtor_(); }
};
template<typename C, typename D>
RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor)
{
return RAII_guard<C, D>{ctor, std::move(dtor)};
}
template<FunctionalProcedure F, typename... Ts>
using is_new_grid_launch_t = typename std::conditional<
is_callable<F(Ts...)>{},
New_grid_launch_tag,
Old_grid_launch_tag>::type;
}
// TODO: - dispatch rank should be derived from the domain dimensions passed
// in, and not always assumed to be 3;
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
void grid_launch_hip_impl_(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
const hc::accelerator_view& acc_v,
K k)
{
const auto d = hc::extent<3>{
num_blocks.z * dim_blocks.z,
num_blocks.y * dim_blocks.y,
num_blocks.x * dim_blocks.x}.tile_with_dynamic(
dim_blocks.z,
dim_blocks.y,
dim_blocks.x,
group_mem_bytes);
try {
hc::parallel_for_each(acc_v, d, k);
}
catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: "
<< ex.what() << std::endl;
throw;
}
}
// TODO: these are workarounds, they should be removed.
hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&);
void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
void unlock_stream_hip_(
hipStream_t, void*, const char*, hc::accelerator_view*);
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
void grid_launch_hip_impl_(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
void* lck_stream = nullptr;
auto acc_v = lock_stream_hip_(stream, lck_stream);
auto stream_guard = make_RAII_guard(
std::bind(
print_prelaunch_trace_,
kernel_name,
num_blocks,
dim_blocks,
group_mem_bytes,
stream),
std::bind(
unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v));
try {
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
acc_v,
std::move(k));
}
catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: "
<< ex.what() << std::endl;
throw;
}
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...})
inline
void grid_launch_hip_impl_(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k)
{
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...})
inline
void grid_launch_hip_impl_(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
kernel_name,
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
grid_launch_hip_impl_(
is_new_grid_launch_t<K, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
kernel_name,
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k)
{
grid_launch_hip_impl_(
is_new_grid_launch_t<K, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k));
}
// TODO: these are temporary and purposefully noisy and disruptive.
#define make_kernel_name_hip(k, n)\
HIP_kernel_functor_name_begin ## _ ## k ## _ ## \
HIP_kernel_functor_name_end ## _ ## n
#define make_kernel_functor_hip_30(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25, p26, p27)\
struct make_kernel_name_hip(function_name, 28) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
std::decay_t<decltype(p26)> _p26_;\
std::decay_t<decltype(p27)> _p27_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
_p26_, _p27_);\
}\
}
#define make_kernel_functor_hip_29(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25, p26)\
struct make_kernel_name_hip(function_name, 27) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
std::decay_t<decltype(p26)> _p26_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
_p26_);\
}\
}
#define make_kernel_functor_hip_28(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25)\
struct make_kernel_name_hip(function_name, 26) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\
}\
}
#define make_kernel_functor_hip_27(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24)\
struct make_kernel_name_hip(function_name, 25) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\
}\
}
#define make_kernel_functor_hip_26(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\
struct make_kernel_name_hip(function_name, 24) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\
}\
}
#define make_kernel_functor_hip_25(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\
struct make_kernel_name_hip(function_name, 23) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
__attribute__((used, flatten))\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_);\
}\
}
#define make_kernel_functor_hip_24(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\
struct make_kernel_name_hip(function_name, 22) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_);\
}\
}
#define make_kernel_functor_hip_23(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\
struct make_kernel_name_hip(function_name, 21) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_);\
}\
}
#define make_kernel_functor_hip_22(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\
struct make_kernel_name_hip(function_name, 20) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_);\
}\
}
#define make_kernel_functor_hip_21(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18)\
struct make_kernel_name_hip(function_name, 19) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_);\
}\
}
#define make_kernel_functor_hip_20(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17)\
struct make_kernel_name_hip(function_name, 18) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\
}\
}
#define make_kernel_functor_hip_19(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16)\
struct make_kernel_name_hip(function_name, 17) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\
}\
}
#define make_kernel_functor_hip_18(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15)\
struct make_kernel_name_hip(function_name, 16) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\
}\
}
#define make_kernel_functor_hip_17(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14)\
struct make_kernel_name_hip(function_name, 15) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_);\
}\
}
#define make_kernel_functor_hip_16(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13)\
struct make_kernel_name_hip(function_name, 14) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_);\
}\
}
#define make_kernel_functor_hip_15(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12)\
struct make_kernel_name_hip(function_name, 13) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_);\
}\
}
#define make_kernel_functor_hip_14(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11)\
struct make_kernel_name_hip(function_name, 12) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_);\
}\
}
#define make_kernel_functor_hip_13(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\
struct make_kernel_name_hip(function_name, 11) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_);\
}\
}
#define make_kernel_functor_hip_12(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\
struct make_kernel_name_hip(function_name, 10) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\
_p9_);\
}\
}
#define make_kernel_functor_hip_11(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\
struct make_kernel_name_hip(function_name, 9) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\
}\
}
#define make_kernel_functor_hip_10(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\
struct make_kernel_name_hip(function_name, 8) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\
}\
}
#define make_kernel_functor_hip_9(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\
struct make_kernel_name_hip(function_name, 7) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\
}\
}
#define make_kernel_functor_hip_8(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5)\
struct make_kernel_name_hip(function_name, 6) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\
}\
}
#define make_kernel_functor_hip_7(\
function_name, kernel_name, p0, p1, p2, p3, p4)\
struct make_kernel_name_hip(function_name, 5) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\
}\
}
#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\
struct make_kernel_name_hip(function_name, 4) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_);\
}\
}
#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\
struct make_kernel_name_hip(function_name, 3) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_);\
}\
}
#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\
struct make_kernel_name_hip(function_name, 2) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_);\
}\
}
#define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n
#define make_kernel_functor_hip_3(function_name, kernel_name, p0)\
struct make_kernel_name_hip(function_name, 1) {\
std::decay_t<decltype(p0)> _p0_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_);\
}\
}
#define make_kernel_functor_hip_2(function_name, kernel_name)\
struct make_kernel_name_hip(function_name, 0) {\
void operator()(const hc::tiled_index<3>&) [[hc]]\
{\
return kernel_name(hipLaunchParm{});\
}\
}
#define make_kernel_functor_hip_1(...)
#define make_kernel_functor_hip_0(...)
#define make_kernel_functor_hip_(...)\
overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__)
#define hipLaunchNamedKernelGGL(\
function_name,\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
...)\
do {\
make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\
hip_kernel_functor_impl_{__VA_ARGS__};\
hip_impl::grid_launch_hip_(\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
#kernel_name,\
hip_kernel_functor_impl_);\
} while(0)
#define hipLaunchKernelGGL(\
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
do {\
hipLaunchNamedKernelGGL(\
unnamed,\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
##__VA_ARGS__);\
} while (0)
#define hipLaunchKernel(\
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
do {\
hipLaunchKernelGGL(\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
hipLaunchParm{},\
##__VA_ARGS__);\
} while(0)
}
#endif //GENERIC_GRID_LAUNCH
#if __hcc_workweek__ >= 17481
#include "functional_grid_launch.hpp"
#else
#include "macro_based_grid_launch.hpp"
#endif
#endif //GENERIC_GRID_LAUNCH
+6 -1
Vedi File
@@ -44,7 +44,12 @@ THE SOFTWARE.
#if GENERIC_GRID_LAUNCH == 0
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
#else
#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak))
#if __hcc_workweek__ >= 17481
#define __global__ \
__attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
#else
#define __global__ __attribute__((hc, used))
#endif
#endif //GENERIC_GRID_LAUNCH
#define __noinline__ __attribute__((noinline))
File diff soppresso perché troppo grande Carica Diff
@@ -0,0 +1,86 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cstddef>
#include <istream>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
struct ihipModuleSymbol_t;
using hipFunction_t = ihipModuleSymbol_t*;
namespace std
{
template<>
struct hash<hsa_agent_t> {
size_t operator()(hsa_agent_t x) const
{
return hash<decltype(x.handle)>{}(x.handle);
}
};
}
inline
constexpr
bool operator==(hsa_agent_t x, hsa_agent_t y)
{
return x.handle == y.handle;
}
namespace hip_impl
{
struct Kernel_descriptor {
std::uint64_t kernel_object_;
std::uint32_t group_size_;
std::uint32_t private_size_;
std::string name_;
operator hipFunction_t() const
{ // TODO: this is awful and only meant for illustration.
return reinterpret_cast<hipFunction_t>(
const_cast<Kernel_descriptor*>(this));
}
};
using RAII_global = std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
const std::unordered_map<
hsa_agent_t, std::vector<hsa_executable_t>>& executables();
const std::unordered_map<
std::uintptr_t,
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
const std::unordered_map<std::uintptr_t, std::string>& function_names();
std::unordered_map<std::string, RAII_global>& globals();
hsa_executable_t load_executable(
const std::string& file,
hsa_executable_t executable,
hsa_agent_t agent);
} // Namespace hip_impl.
+39
Vedi File
@@ -0,0 +1,39 @@
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include <hsa/hsa.h>
#include <cstdint>
#include <string>
#include <vector>
hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple)
{
static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx";
static constexpr std::size_t prefix_sz = sizeof(prefix) - 1;
hsa_isa_t r = {};
auto idx = triple.find(prefix);
if (idx != std::string::npos) {
idx += prefix_sz;
std::string tmp = "AMD:AMDGPU";
while (idx != triple.size()) {
tmp.push_back(':');
tmp.push_back(triple[idx++]);
}
hsa_isa_from_name(tmp.c_str(), &r);
}
return r;
}
// DATA - STATICS
constexpr const char hip_impl::Bundled_code_header::magic_string_[];
// CREATORS
hip_impl::Bundled_code_header::Bundled_code_header(
const std::vector<std::uint8_t>& x)
: Bundled_code_header{x.cbegin(), x.cend()}
{}
+57 -41
Vedi File
@@ -28,27 +28,6 @@ extern "C" float __ocml_rint_f32(float);
extern "C" float __ocml_ceil_f32(float);
extern "C" float __ocml_trunc_f32(float);
struct holder64Bit{
union{
double d;
unsigned long int uli;
signed long int sli;
signed int si[2];
unsigned int ui[2];
};
} __attribute__((aligned(8)));
struct holder32Bit {
union {
float f;
unsigned int ui;
signed int si;
};
} __attribute__((aligned(4)));
__device__ struct holder64Bit hold64;
__device__ struct holder32Bit hold32;
__device__ float __double2float_rd(double x)
{
return (double)x;
@@ -69,13 +48,21 @@ __device__ float __double2float_rz(double x)
__device__ int __double2hiint(double x)
{
hold64.d = x;
return hold64.si[1];
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[1];
}
__device__ int __double2loint(double x)
{
hold64.d = x;
return hold64.si[0];
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[0];
}
@@ -150,8 +137,12 @@ __device__ unsigned long long int __double2ull_rz(double x)
__device__ long long int __double_as_longlong(double x)
{
hold64.d = x;
return hold64.sli;
static_assert(sizeof(long long) == sizeof(double), "");
long long tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ int __float2int_rd(float x)
@@ -224,19 +215,32 @@ __device__ unsigned long long int __float2ull_rz(float x)
__device__ int __float_as_int(float x)
{
hold32.f = x;
return hold32.si;
static_assert(sizeof(int) == sizeof(float), "");
int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ unsigned int __float_as_uint(float x)
{
hold32.f = x;
return hold32.ui;
static_assert(sizeof(unsigned int) == sizeof(float), "");
unsigned int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __hiloint2double(int hi, int lo)
__device__ double __hiloint2double(int32_t hi, int32_t lo)
{
hold64.si[1] = hi;
hold64.si[0] = lo;
return hold64.d;
static_assert(sizeof(double) == sizeof(uint64_t), "");
uint64_t tmp0 =
(static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
double tmp1;
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__ double __int2double_rn(int x)
{
@@ -262,8 +266,12 @@ __device__ float __int2float_rz(int x)
__device__ float __int_as_float(int x)
{
hold32.si = x;
return hold32.f;
static_assert(sizeof(float) == sizeof(int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ll2double_rd(long long int x)
@@ -302,8 +310,12 @@ __device__ float __ll2float_rz(long long int x)
__device__ double __longlong_as_double(long long int x)
{
hold64.sli = x;
return hold64.d;
static_assert(sizeof(double) == sizeof(long long), "");
double tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return x;
}
__device__ double __uint2double_rn(int x)
@@ -330,8 +342,12 @@ __device__ float __uint2float_rz(unsigned int x)
__device__ float __uint_as_float(unsigned int x)
{
hold32.ui = x;
return hold32.f;
static_assert(sizeof(float) == sizeof(unsigned int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ull2double_rd(unsigned long long int x)
+39 -94
Vedi File
@@ -102,111 +102,56 @@ __device__ void* __hip_hc_free(void *ptr)
// loop unrolling
__device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size)
{
uint8_t *dstPtr, *srcPtr;
dstPtr = (uint8_t*)dst;
srcPtr = (uint8_t*)src;
for(uint32_t i=0;i<size;i++) {
dstPtr[i] = srcPtr[i];
auto dstPtr = static_cast<uint8_t*>(dst);
auto srcPtr = static_cast<const uint8_t*>(src);
while (size >= 4u) {
dstPtr[0] = srcPtr[0];
dstPtr[1] = srcPtr[1];
dstPtr[2] = srcPtr[2];
dstPtr[3] = srcPtr[3];
size -= 4u;
srcPtr += 4u;
dstPtr += 4u;
}
return nullptr;
switch (size) {
case 3: dstPtr[2] = srcPtr[2];
case 2: dstPtr[1] = srcPtr[1];
case 1: dstPtr[0] = srcPtr[0];
}
return dst;
}
__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size)
__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size)
{
uint8_t *dstPtr;
dstPtr = (uint8_t*)ptr;
for(uint32_t i=0;i<size;i++) {
dstPtr[i] = val;
auto dstPtr = static_cast<uint8_t*>(dst);
while (size >= 4u) {
dstPtr[0] = val;
dstPtr[1] = val;
dstPtr[2] = val;
dstPtr[3] = val;
size -= 4u;
dstPtr += 4u;
}
return nullptr;
switch (size) {
case 3: dstPtr[2] = val;
case 2: dstPtr[1] = val;
case 1: dstPtr[0] = val;
}
return dst;
}
__device__ float __hip_erfinvf(float x){
float ret;
int sign;
if (x < -1 || x > 1){
return NAN;
}
if (x == 0){
return 0;
}
if (x > 0){
sign = 1;
} else {
sign = -1;
x = -x;
}
if (x <= 0.7) {
float x1 = x * x;
float x2 = __hip_erfinva3 * x1 + __hip_erfinva2;
float x3 = x2 * x1 + __hip_erfinva1;
float x4 = x * (x3 * x1 + __hip_erfinva0);
float r1 = __hip_erfinvb4 * x1 + __hip_erfinvb3;
float r2 = r1 * x1 + __hip_erfinvb2;
float r3 = r2 * x1 + __hip_erfinvb1;
ret = x4 / (r3 * x1 + __hip_erfinvb0);
} else {
float x1 = hc::precise_math::sqrtf(-hc::precise_math::logf((1 - x) / 2));
float x2 = __hip_erfinvc3 * x1 + __hip_erfinvc2;
float x3 = x2 * x1 + __hip_erfinvc1;
float x4 = x3 * x1 + __hip_erfinvc0;
float r1 = __hip_erfinvd2 * x1 + __hip_erfinvd1;
ret = x4 / (r1 * x1 + __hip_erfinvd0);
}
ret = ret * sign;
x = x * sign;
ret -= (hc::precise_math::erff(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::expf(-ret * ret));
ret -= (hc::precise_math::erff(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::expf(-ret * ret));
return ret;
return hc::precise_math::erfinvf(x);
}
__device__ double __hip_erfinv(double x){
double ret;
int sign;
if (x < -1 || x > 1){
return NAN;
}
if (x == 0){
return 0;
}
if (x > 0){
sign = 1;
} else {
sign = -1;
x = -x;
}
if (x <= 0.7) {
double x1 = x * x;
double x2 = __hip_erfinva3 * x1 + __hip_erfinva2;
double x3 = x2 * x1 + __hip_erfinva1;
double x4 = x * (x3 * x1 + __hip_erfinva0);
double r1 = __hip_erfinvb4 * x1 + __hip_erfinvb3;
double r2 = r1 * x1 + __hip_erfinvb2;
double r3 = r2 * x1 + __hip_erfinvb1;
ret = x4 / (r3 * x1 + __hip_erfinvb0);
} else {
double x1 = hc::precise_math::sqrt(-hc::precise_math::log((1 - x) / 2));
double x2 = __hip_erfinvc3 * x1 + __hip_erfinvc2;
double x3 = x2 * x1 + __hip_erfinvc1;
double x4 = x3 * x1 + __hip_erfinvc0;
double r1 = __hip_erfinvd2 * x1 + __hip_erfinvd1;
ret = x4 / (r1 * x1 + __hip_erfinvd0);
}
ret = ret * sign;
x = x * sign;
ret -= (hc::precise_math::erf(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::exp(-ret * ret));
ret -= (hc::precise_math::erf(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::exp(-ret * ret));
return ret;
return hc::precise_math::erfinv(x);
}
#define __hip_j0a1 57568490574.0
+137
Vedi File
@@ -0,0 +1,137 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hcc_detail/program_state.hpp"
#include "hip/hip_runtime_api.h"
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <iostream>
using namespace hc;
using namespace std;
namespace hip_impl
{
namespace
{
inline
string name(uintptr_t function_address)
{
const auto it = function_names().find(function_address);
if (it == function_names().cend()) {
throw runtime_error{
"Invalid function passed to hipLaunchKernelGGL."};
}
return it->second;
}
inline
string name(hsa_agent_t agent)
{
char n[64] = {};
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
return string{n};
}
inline
hsa_agent_t target_agent(hipStream_t stream)
{
if (stream) {
return *static_cast<hsa_agent_t*>(
stream->locked_getAv()->get_hsa_agent());
}
else if (
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
return ihipGetDevice(
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
}
else {
return *static_cast<hsa_agent_t*>(
accelerator{}.get_default_view().get_hsa_agent());
}
}
}
void hipLaunchKernelGGLImpl(
uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg)
{
const auto it0 = functions().find(function_address);
if (it0 == functions().cend()) {
throw runtime_error{
"No device code available for function: " +
name(function_address)
};
}
auto agent = target_agent(stream);
const auto it1 = find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const pair<hsa_agent_t, Kernel_descriptor>& x) {
return x.first.handle == agent.handle;
});
if (it1 == it0->second.cend()) {
throw runtime_error{
"No code available for function: " + name(function_address) +
", for agent: " + name(agent)
};
}
for (auto&& agent_kernel : it0->second) {
if (agent.handle == agent_kernel.first.handle) {
hipModuleLaunchKernel(
agent_kernel.second,
numBlocks.x,
numBlocks.y,
numBlocks.z,
dimBlocks.x,
dimBlocks.y,
dimBlocks.z,
sharedMemBytes,
stream,
nullptr,
kernarg);
}
}
}
}
+5 -75
Vedi File
@@ -22,78 +22,8 @@ THE SOFTWARE.
#include "hip/hcc_detail/grid_launch_GGL.hpp"
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <iostream>
#include <sstream>
namespace hip_impl
{
hc::accelerator_view lock_stream_hip_(
hipStream_t& stream, void*& locked_stream)
{ // This allocated but does not take ownership of locked_stream. If it is
// not deleted elsewhere it will leak.
using L = decltype(stream->lockopen_preKernelCommand());
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
locked_stream = new L{stream->lockopen_preKernelCommand()};
return (*static_cast<L*>(locked_stream))->_av;
}
void print_prelaunch_trace_(
const char* kernel_name,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream)
{
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
HIP_PROFILE_API ||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
std::stringstream os;
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
<< " hipLaunchKernel '" << kernel_name << "'"
<< " gridDim:" << num_blocks
<< " groupDim:" << dim_blocks
<< " sharedMem:+" << group_mem_bytes
<< " " << *stream;
if (HIP_PROFILE_API == 0x1) {
std::string shortAtpString("hipLaunchKernel:");
shortAtpString += kernel_name;
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
} else if (HIP_PROFILE_API == 0x2) {
MARKER_BEGIN(os.str().c_str(), "HIP");
}
if (COMPILE_HIP_DB && HIP_TRACE_API) {
std::string fullStr;
recordApiTrace(&fullStr, os.str());
}
}
}
void unlock_stream_hip_(
hipStream_t stream,
void* locked_stream,
const char* kernel_name,
hc::accelerator_view* acc_v)
{ // Precondition: acc_v is the accelerator_view associated with stream
// which is guarded by locked_stream;
// locked_stream is deletable.
using L = decltype(stream->lockopen_preKernelCommand());
stream->lockclose_postKernelCommand(kernel_name, acc_v);
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
#if __hcc_workweek__ >= 17481
#include "functional_grid_launch.inl"
#else
#include "macro_based_grid_launch.inl"
#endif
+3 -3
Vedi File
@@ -446,14 +446,14 @@ hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop )
{
HIP_INIT_API(device,prop);
hipDeviceProp_t tempProp;
int deviceCount;
int inPropCount = 0;
int matchedPropCount = 0;
hipError_t e = hipSuccess;
if((device == NULL) || (prop == NULL)) {
e = hipErrorInvalidValue;
}
if(e == hipSuccess) {
int deviceCount;
int inPropCount = 0;
int matchedPropCount = 0;
ihipGetDeviceCount( &deviceCount );
*device = 0;
for (int i = 0; i < deviceCount; i++) {
+2
Vedi File
@@ -779,6 +779,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
// Get agent name
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, &(prop->name));
DeviceErrorCheck(err);
char archName[256];
err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName);
+19 -18
Vedi File
@@ -249,7 +249,7 @@ static const DbName dbName [] =
#define tprintf(trace_level, ...) {\
if (HIP_DB & (1<<(trace_level))) {\
char msgStr[1000];\
snprintf(msgStr, 2000, __VA_ARGS__);\
snprintf(msgStr, sizeof(msgStr), __VA_ARGS__);\
fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \
}\
}
@@ -269,7 +269,7 @@ extern uint64_t recordApiTrace(std::string *fullStr, const std::string &apiStr);
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...)\
uint64_t hipApiStartTick;\
uint64_t hipApiStartTick=0;\
{\
tls_tidInfo.incApiSeqNum();\
if (forceTrace || (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL))))) {\
@@ -339,7 +339,7 @@ uint64_t hipApiStartTick;\
class ihipException : public std::exception
{
public:
ihipException(hipError_t e) : _code(e) {};
explicit ihipException(hipError_t e) : _code(e) {};
hipError_t _code;
};
@@ -372,16 +372,16 @@ public:
};
class ihipModule_t {
public:
hsa_executable_t executable;
hsa_code_object_t object;
struct ihipModule_t {
std::string fileName;
void *ptr;
size_t size;
std::list<hipFunction_t> funcTrack;
std::unordered_map<std::string, uintptr_t> coGlobals;
ihipModule_t() : executable(), object(), fileName(), ptr(nullptr), size(0) {}
hsa_executable_t executable = {};
hsa_code_object_reader_t coReader = {};
~ihipModule_t()
{
if (executable.handle) hsa_executable_destroy(executable);
if (coReader.handle) hsa_code_object_reader_destroy(coReader);
}
};
@@ -669,11 +669,11 @@ template <typename MUTEX_TYPE>
class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE>
{
public:
ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) :
explicit ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) :
_parent(parentEvent)
{}
~ihipEventCriticalBase_t() {};
// Keep data in structure so it can be easily copied into snapshots
// (used to reduce lock contention and preserve correct lock order)
ihipEventData_t _eventData;
@@ -690,7 +690,7 @@ typedef LockedAccessor<ihipEventCritical_t> LockedAccessor_EventCrit_t;
// internal hip event structure.
class ihipEvent_t {
public:
ihipEvent_t(unsigned flags);
explicit ihipEvent_t(unsigned flags);
void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType);
std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
@@ -698,7 +698,7 @@ public:
// Return a copy of the critical state. The critical data is locked during the copy.
ihipEventData_t locked_copyCrit() {
LockedAccessor_EventCrit_t crit(_criticalData);
return _criticalData._eventData;
return _criticalData._eventData;
};
ihipEventCritical_t &criticalData() { return _criticalData; };
@@ -720,8 +720,9 @@ template <typename MUTEX_TYPE>
class ihipDeviceCriticalBase_t : LockedBase<MUTEX_TYPE>
{
public:
ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) :
_parent(parentDevice)
explicit ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) :
_parent(parentDevice),
_ctxCount(0)
{
};
+107 -92
Vedi File
@@ -44,7 +44,7 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
try {
stream->locked_copyAsync(dst, src, sizeBytes, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
} else {
@@ -808,6 +808,26 @@ hipError_t hipHostUnregister(void *hostPtr)
return ihipLogStatus(hip_status);
}
namespace
{
inline
hipDeviceptr_t agent_address_for_symbol(const char* symbolName)
{
hipDeviceptr_t r = nullptr;
#if __hcc_workweek__ >= 17481
size_t byte_cnt = 0u;
hipModuleGetGlobal(&r, &byte_cnt, 0, symbolName);
#else
auto ctx = ihipGetTlsDefaultCtx();
auto acc = ctx->getDevice()->_acc;
r = acc.get_symbol_address(symbolName);
#endif
return r;
}
}
hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
{
HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind);
@@ -821,7 +841,8 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou
hc::accelerator acc = ctx->getDevice()->_acc;
void *dst = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t dst =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -856,7 +877,8 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
hc::accelerator acc = ctx->getDevice()->_acc;
void *src = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t src =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -893,7 +915,8 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
hc::accelerator acc = ctx->getDevice()->_acc;
void *dst = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t dst =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -905,7 +928,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
try {
stream->lockedSymbolCopyAsync(acc, dst, (void*)src, count, offset, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
} else {
@@ -931,7 +954,8 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
hc::accelerator acc = ctx->getDevice()->_acc;
void *src = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t src =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src);
if(src == nullptr || dst == nullptr)
@@ -944,7 +968,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
try {
stream->lockedSymbolCopyAsync(acc, dst, src, count, offset, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
} else {
@@ -969,7 +993,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
stream->locked_copySync(dst, src, sizeBytes, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -991,7 +1015,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1013,7 +1037,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1035,7 +1059,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1057,7 +1081,7 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1116,7 +1140,7 @@ hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch
stream->locked_copySync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind);
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1154,7 +1178,7 @@ hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t sp
e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream);
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1207,7 +1231,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
stream->locked_copySync((unsigned char*)dst->data + i*dst_w, (unsigned char*)src + i*src_w, width, kind);
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1228,7 +1252,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
try {
stream->locked_copySync((char *)dst->data + wOffset, src, count, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1278,49 +1302,66 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
}
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
return ihipLogStatus(e);
}
// TODO - make member function of stream?
namespace
{
template<
uint32_t block_dim,
typename RandomAccessIterator,
typename N,
typename T>
__global__
void hip_fill_n(RandomAccessIterator f, N n, T value)
{
const uint32_t grid_dim = gridDim.x * blockDim.x;
size_t idx = blockIdx.x * block_dim + threadIdx.x;
while (idx < n) {
__builtin_memcpy(
reinterpret_cast<void*>(&f[idx]),
reinterpret_cast<const void*>(&value),
sizeof(T));
idx += grid_dim;
}
}
template<
typename T,
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline
const T& clamp_integer(const T& x, const T& lower, const T& upper)
{
assert(!(upper < lower));
return std::min(upper, std::max(x, lower));
}
}
template <typename T>
void
ihipMemsetKernel(hipStream_t stream,
LockedAccessor_StreamCrit_t &crit,
T * ptr, T val, size_t sizeBytes,
hc::completion_future *cf)
T * ptr, T val, size_t sizeBytes)
{
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
const int threads_per_wg = 256;
static constexpr uint32_t block_dim = 256;
int threads = wg * threads_per_wg;
if (threads > sizeBytes) {
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
}
hc::extent<1> ext(threads);
auto ext_tile = ext.tile(threads_per_wg);
*cf =
hc::parallel_for_each(
crit->_av,
ext_tile,
[=] (hc::tiled_index<1> idx)
__attribute__((hc))
{
int offset = amp_get_global_id(0);
// TODO-HCC - change to hc_get_local_size()
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
for (int i=offset; i<sizeBytes; i+=stride) {
ptr[i] = val;
}
});
const uint32_t grid_dim = clamp_integer<size_t>(
sizeBytes / block_dim, 1, UINT32_MAX);
hipLaunchKernelGGL(
hip_fill_n<block_dim>,
dim3(grid_dim),
dim3{block_dim},
0u,
stream,
ptr,
sizeBytes,
std::move(val));
}
@@ -1334,17 +1375,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1352,19 +1388,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
if (HIP_API_BLOCKING) {
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
cf.wait();
stream->locked_wait();
}
} else {
e = hipErrorInvalidValue;
@@ -1385,16 +1418,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1402,21 +1431,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1437,17 +1463,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
size_t sizeBytes = pitch * height;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1455,20 +1477,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1489,36 +1509,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
std::cout << ex.what() << std::endl;
e = hipErrorInvalidValue;
}
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
cf.wait();
stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1717,7 +1731,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
HIP_INIT_API ( handle, devPtr);
hipError_t hipStatus = hipSuccess;
// Get the size of allocated pointer
size_t psize;
size_t psize = 0u;
hc::accelerator acc;
if((handle == NULL) || (devPtr == NULL)) {
hipStatus = hipErrorInvalidResourceHandle;
@@ -1730,8 +1744,9 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr );
if (status == AM_SUCCESS) {
psize = (size_t)amPointerInfo._sizeBytes;
} else
} else {
hipStatus = hipErrorInvalidResourceHandle;
}
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
// Save the size of the pointer to hipIpcMemHandle
iHandle->psize = psize;
+233 -517
Vedi File
@@ -20,62 +20,65 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include <map>
#include "elfio/elfio.hpp"
#include "hip/hip_runtime.h"
#include "hip/hcc_detail/program_state.hpp"
#include "hip_hcc_internal.h"
#include "hsa_helpers.hpp"
#include "trace_helper.h"
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/amd_hsa_kernel_code.h>
#include "elfio/elfio.hpp"
#include "hip/hip_runtime.h"
#include "hip_hcc_internal.h"
#include "trace_helper.h"
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <map>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
//TODO Use Pool APIs from HCC to get memory regions.
#include <cassert>
using namespace ELFIO;
using namespace hip_impl;
using namespace std;
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
assert(Align != 0u && "Align can't be 0.");
Skew %= Align;
return (Value + Align - 1 - Skew) / Align * Align + Skew;
}
struct ihipKernArgInfo{
std::vector<uint32_t> Size;
std::vector<uint32_t> Align;
std::vector<std::string> ArgType;
std::vector<std::string> ArgName;
vector<uint32_t> Size;
vector<uint32_t> Align;
vector<string> ArgType;
vector<string> ArgName;
uint32_t totalSize;
};
std::map<std::string,struct ihipKernArgInfo> kernelArguments;
struct MyElfNote {
uint32_t n_namesz = 0;
uint32_t n_descsz = 0;
uint32_t n_type = 0;
MyElfNote() = default;
};
map<string, ihipKernArgInfo> kernelArguments;
struct ihipModuleSymbol_t{
uint64_t _object; // The kernel object.
uint32_t _groupSegmentSize;
uint32_t _privateSegmentSize;
std::string _name; // TODO - review for performance cost. Name is just used for debug.
uint64_t _object; // The kernel object.
uint32_t _groupSegmentSize;
uint32_t _privateSegmentSize;
string _name; // TODO - review for performance cost. Name is just used for debug.
};
template <>
std::string ToString(hipFunction_t v)
string ToString(hipFunction_t v)
{
std::ostringstream ss;
ss << "0x" << std::hex << v->_object;
@@ -93,287 +96,20 @@ if (hsaStatus != HSA_STATUS_SUCCESS) {\
return ihipLogStatus(hipStatus);\
}
namespace hipdrv {
hsa_status_t findSystemRegions(hsa_region_t region, void *data){
hsa_region_segment_t segment_id;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
return HSA_STATUS_SUCCESS;
}
hsa_region_global_flag_t flags;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
hsa_region_t *reg = (hsa_region_t*)data;
if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
*reg = region;
}
return HSA_STATUS_SUCCESS;
}
} // End namespace hipdrv
uint64_t PrintSymbolSizes(const void *emi, const char *name){
using namespace ELFIO;
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
for(uint16_t i=0;i<ehdr->e_shnum;++i){
if(shdr[i].sh_type == SHT_SYMTAB){
const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
assert(syms);
uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
assert(strtab);
for(uint64_t i=0;i<numSyms;++i){
const char *symname = strtab + syms[i].st_name;
assert(symname);
uint64_t size = syms[i].st_size;
if(strcmp(name, symname) == 0){
return size;
}
}
}
}
return 0;
}
uint64_t ElfSize(const void *emi){
using namespace ELFIO;
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
uint64_t max_offset = ehdr->e_shoff;
uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;
for(uint16_t i=0;i < ehdr->e_shnum;++i){
uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
if(max_offset < cur_offset){
max_offset = cur_offset;
total_size = max_offset;
if(SHT_NOBITS != shdr[i].sh_type){
total_size += static_cast<uint64_t>(shdr[i].sh_size);
}
}
}
return total_size;
}
namespace
{
template<typename P>
inline
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p)
{
using namespace std;
const auto it = find_if(
reader.sections.begin(), reader.sections.end(), move(p));
return it != reader.sections.end() ? *it : nullptr;
}
inline
std::vector<std::string> copy_names_of_undefined_symbols(
const ELFIO::symbol_section_accessor& section)
{
using namespace ELFIO;
using namespace std;
vector<string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
section.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (sect_idx == SHN_UNDEF && !name.empty()) {
r.push_back(std::move(name));
}
}
return r;
}
inline
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword> find_symbol_address(
const ELFIO::symbol_section_accessor& section,
const std::string& symbol_name)
{
using namespace ELFIO;
using namespace std;
static const pair<Elf64_Addr, Elf_Xword> r{0, 0};
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
section.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (name == symbol_name) return make_pair(value, size);
}
return r;
}
inline
void associate_code_object_symbols_with_host_allocation(
hipModule_t module,
const ELFIO::elfio& reader,
const ELFIO::elfio& self_reader,
ELFIO::section* code_object_dynsym,
ELFIO::section* process_symtab,
hsa_agent_t agent,
hsa_executable_t executable)
{
using namespace ELFIO;
using namespace std;
if (!code_object_dynsym || !process_symtab) return;
const auto undefined_symbols = copy_names_of_undefined_symbols(
symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
const auto tmp = find_symbol_address(
symbol_section_accessor{self_reader, process_symtab}, x);
assert(tmp.first);
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(tmp.first), tmp.second, &agent, 1, &p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
static vector<
unique_ptr<void, decltype(hsa_amd_memory_unlock)*>> globals;
static mutex mtx;
lock_guard<std::mutex> lck{mtx};
globals.emplace_back(p, hsa_amd_memory_unlock);
if (module->coGlobals.count(x) == 0) {
module->coGlobals.emplace(x, tmp.first);
}
}
}
inline
void load_code_object_and_freeze_executable(
const char* file, hsa_agent_t agent, hsa_executable_t executable)
{ // TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
using namespace std;
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
hsa_code_object_reader_destroy(*p);
};
using RAII_code_reader = unique_ptr<
hsa_code_object_reader_t, decltype(cor_deleter)>;
unique_ptr<FILE, decltype(fclose)*> cobj{fopen(file, "r"), fclose};
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get());
hsa_executable_load_agent_code_object(
executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static vector<RAII_code_reader> code_readers;
static mutex mtx;
lock_guard<mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
}
hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
{
using namespace ELFIO;
HIP_INIT_API(module, fname);
hipError_t ret = hipSuccess;
*module = new ihipModule_t;
if(module == NULL){
return ihipLogStatus(hipErrorInvalidValue);
}
if (!fname) return ihipLogStatus(hipErrorInvalidValue);
auto ctx = ihipGetTlsDefaultCtx();
if(ctx == nullptr){
ret = hipErrorInvalidContext;
ifstream file{fname};
}else{
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
if (!file.is_open()) return ihipLogStatus(hipErrorFileNotFound);
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&(*module)->executable);
vector<char> tmp{
istreambuf_iterator<char>{file}, istreambuf_iterator<char>{}};
elfio reader;
if (!reader.load(fname)) {
return ihipLogStatus(hipErrorFileNotFound);
}
else {
// TODO: this may benefit from caching as well.
elfio self_reader;
self_reader.load("/proc/self/exe");
const auto symtab =
find_section_if(self_reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_SYMTAB;
});
const auto code_object_dynsym =
find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_DYNSYM;
});
associate_code_object_symbols_with_host_allocation(
*module,
reader,
self_reader,
code_object_dynsym,
symtab,
currentDevice->_hsaAgent,
(*module)->executable);
load_code_object_and_freeze_executable(
fname, currentDevice->_hsaAgent, (*module)->executable);
}
}
return ihipLogStatus(ret);
return hipModuleLoadData(module, tmp.data());
}
@@ -385,92 +121,13 @@ hipError_t hipModuleUnload(hipModule_t hmod)
// Currently we want for all inflight activity to complete, but don't prevent another
// thread from launching new kernels before we finish this operation.
ihipSynchronize();
hipError_t ret = hipSuccess;
hsa_status_t status = hsa_executable_destroy(hmod->executable);
if(status != HSA_STATUS_SUCCESS)
{
ret = hipErrorInvalidValue;
}
// status = hsa_code_object_destroy(hmod->object);
// if(status != HSA_STATUS_SUCCESS)
// {
// ret = hipErrorInvalidValue;
// }
// status = hsa_memory_free(hmod->ptr);
// if(status != HSA_STATUS_SUCCESS)
// {
// ret = hipErrorInvalidValue;
// }
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
delete *f;
}
delete hmod;
return ihipLogStatus(ret);
delete hmod; // The ihipModule_t dtor will clean everything up.
hmod = nullptr;
return ihipLogStatus(hipSuccess);
}
hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name)
{
auto ctx = ihipGetTlsDefaultCtx();
hipError_t ret = hipSuccess;
if (name == nullptr){
return (hipErrorInvalidValue);
}
if (ctx == nullptr){
ret = hipErrorInvalidContext;
} else {
std::string str(name);
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
if((*f)->_name == str) {
*func = *f;
return ret;
}
}
ihipModuleSymbol_t *sym = new ihipModuleSymbol_t;
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
hsa_status_t status;
hsa_executable_symbol_t symbol;
status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &symbol);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotFound;
}
status = hsa_executable_symbol_get_info(symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&sym->_object);
CHECK_HSA(status, hipErrorNotFound);
status = hsa_executable_symbol_get_info(symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&sym->_groupSegmentSize);
CHECK_HSA(status, hipErrorNotFound);
status = hsa_executable_symbol_get_info(symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&sym->_privateSegmentSize);
CHECK_HSA(status, hipErrorNotFound);
sym->_name = name;
*func = sym;
hmod->funcTrack.push_back(*func);
}
return ret;
}
hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
const char *name){
HIP_INIT_API(hfunc, hmod, name);
return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name));
}
hipError_t ihipModuleLaunchKernel(hipFunction_t f,
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
@@ -621,45 +278,11 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
namespace
{
struct Agent_global {
std::string name;
string name;
hipDeviceptr_t address;
std::uint32_t byte_cnt;
uint32_t byte_cnt;
};
inline
void* address(hsa_executable_symbol_t x)
{
void* r = nullptr;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
return r;
}
inline
std::string name(hsa_executable_symbol_t x)
{
uint32_t sz = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
std::string r(sz, '\0');
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
return r;
}
inline
std::uint32_t size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
return r;
}
inline
void track(const Agent_global& x)
{
@@ -684,7 +307,7 @@ namespace
hc::am_memtracker_update(x.address, device->_deviceId, 0u);
}
template<typename Container = std::vector<Agent_global>>
template<typename Container = vector<Agent_global>>
inline
hsa_status_t copy_agent_global_variables(
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* out)
@@ -709,56 +332,58 @@ namespace
{
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) throw std::runtime_error{"No active HIP context."};
if (!ctx) throw runtime_error{"No active HIP context."};
auto device = ctx->getDevice();
if (!device) throw std::runtime_error{"No device available for HIP."};
if (!device) throw runtime_error{"No device available for HIP."};
ihipDevice_t *currentDevice = ihipGetDevice(device->_deviceId);
if (!currentDevice) {
throw std::runtime_error{"No active device for HIP"};
}
if (!currentDevice) throw runtime_error{"No active device for HIP."};
return currentDevice->_hsaAgent;
}
inline
std::vector<Agent_global> read_agent_globals(hipModule_t hmodule)
vector<Agent_global> read_agent_globals(
hsa_agent_t agent, hsa_executable_t executable)
{
std::vector<Agent_global> r;
vector<Agent_global> r;
hsa_executable_iterate_agent_symbols(
hmodule->executable, this_agent(), copy_agent_global_variables, &r);
executable, agent, copy_agent_global_variables, &r);
return r;
}
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
hipModule_t hmod, const char* name)
{
HIP_INIT_API(dptr, bytes, hmod, name);
hipError_t ret = hipSuccess;
if(dptr == NULL || bytes == NULL){
return ihipLogStatus(hipErrorInvalidValue);
template<typename ForwardIterator>
pair<hipDeviceptr_t, size_t> read_global_description(
ForwardIterator f, ForwardIterator l, const char* name)
{
const auto it = std::find_if(
f, l, [=](const Agent_global& x) { return x.name == name; });
return it == l ?
make_pair(nullptr, 0u) : make_pair(it->address, it->byte_cnt);
}
if(name == NULL || hmod == NULL){
return ihipLogStatus(hipErrorNotInitialized);
}
else{
static std::unordered_map<
hipModule_t, std::vector<Agent_global>> agent_globals;
hipError_t read_agent_global_from_module(
hipDeviceptr_t *dptr,
size_t* bytes,
hipModule_t hmod,
const char* name)
{
static unordered_map<hipModule_t, vector<Agent_global>> agent_globals;
// TODO: this is not particularly robust.
if (agent_globals.count(hmod) == 0) {
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
static mutex mtx;
lock_guard<mutex> lck{mtx};
if (agent_globals.count(hmod) == 0) {
agent_globals.emplace(hmod, read_agent_globals(hmod));
agent_globals.emplace(
hmod, read_agent_globals(this_agent(), hmod->executable));
}
}
@@ -766,71 +391,163 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
// It will have to be properly fleshed out in the future.
const auto it0 = agent_globals.find(hmod);
if (it0 == agent_globals.cend()) {
throw std::runtime_error{"agent_globals data structure corrupted."};
throw runtime_error{"agent_globals data structure corrupted."};
}
const auto it1 = std::find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const Agent_global& x) { return x.name == name; });
tie(*dptr, *bytes) = read_global_description(
it0->second.cbegin(), it0->second.cend(), name);
if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound);
*dptr = it1->address;
*bytes = it1->byte_cnt;
return ihipLogStatus(hipSuccess);
return dptr ? hipSuccess : hipErrorNotFound;
}
hipError_t read_agent_global_from_process(
hipDeviceptr_t *dptr, size_t* bytes, const char* name)
{
static unordered_map<hsa_agent_t, vector<Agent_global>> agent_globals;
static std::once_flag f;
call_once(f, []() {
for (auto&& agent_executables : hip_impl::executables()) {
vector<Agent_global> tmp0;
for (auto&& executable : agent_executables.second) {
auto tmp1 = read_agent_globals(
agent_executables.first, executable);
tmp0.insert(
tmp0.end(),
make_move_iterator(tmp1.begin()),
make_move_iterator(tmp1.end()));
}
agent_globals.emplace(agent_executables.first, move(tmp0));
}
});
const auto it = agent_globals.find(this_agent());
if (it == agent_globals.cend()) return hipErrorNotInitialized;
tie(*dptr, *bytes) = read_global_description(
it->second.cbegin(), it->second.cend(), name);
return dptr ? hipSuccess : hipErrorNotFound;
}
hsa_executable_symbol_t find_kernel_by_name(
hsa_executable_t executable, const char* kname)
{
pair<const char*, hsa_executable_symbol_t> r{kname, {}};
hsa_executable_iterate_agent_symbols(
executable,
this_agent(),
[](hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* s) {
auto p =
static_cast<pair<const char*, hsa_executable_symbol_t>*>(s);
if (type(x) != HSA_SYMBOL_KIND_KERNEL) {
return HSA_STATUS_SUCCESS;
}
if (name(x) != p->first) return HSA_STATUS_SUCCESS;
p->second = x;
return HSA_STATUS_INFO_BREAK;
}, &r);
return r.second;
}
string read_elf_file_as_string(const void* file)
{ // Precondition: file points to an ELF image that was BITWISE loaded
// into process accessible memory, and not one loaded by
// the loader. This is because in the latter case
// alignment may differ, which will break the size
// computation.
// the image is Elf64, and matches endianness i.e. it is
// Little Endian.
if (!file) return {};
auto h = static_cast<const Elf64_Ehdr*>(file);
auto s = static_cast<const char*>(file);
// This assumes the common case of SHT being the last part of the ELF.
auto sz = sizeof(Elf64_Ehdr) + h->e_shoff + h->e_shentsize * h->e_shnum;
return string{s, s + sz};
}
} // Anonymous namespace, internal linkage.
hipError_t ihipModuleGetFunction(
hipFunction_t *func, hipModule_t hmod, const char *name)
{
HIP_INIT_API(func, hmod, name);
if (!func || !name) return ihipLogStatus(hipErrorInvalidValue);
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) return ihipLogStatus(hipErrorInvalidContext);
hipError_t ret = hipSuccess;
*func = new ihipModuleSymbol_t;
if (!*func) return ihipLogStatus(hipErrorInvalidValue);
auto kernel = find_kernel_by_name(hmod->executable, name);
if (kernel.handle == 0u) return ihipLogStatus(hipErrorNotFound);
(*func)->_object = kernel_object(kernel);
(*func)->_groupSegmentSize = group_size(kernel);
(*func)->_privateSegmentSize = private_size(kernel);
(*func)->_name = name;
return ihipLogStatus(hipSuccess);
}
hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
const char *name){
HIP_INIT_API(hfunc, hmod, name);
return ihipLogStatus(ihipModuleGetFunction(hfunc, hmod, name));
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
hipModule_t hmod, const char* name)
{
HIP_INIT_API(dptr, bytes, hmod, name);
if(!dptr || !bytes) return ihipLogStatus(hipErrorInvalidValue);
if(!name) return ihipLogStatus(hipErrorNotInitialized);
const auto r = hmod ?
read_agent_global_from_module(dptr, bytes, hmod, name) :
read_agent_global_from_process(dptr, bytes, name);
return ihipLogStatus(r);
}
hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
{
HIP_INIT_API(module, image);
hipError_t ret = hipSuccess;
if(image == NULL || module == NULL){
return ihipLogStatus(hipErrorNotInitialized);
} else {
auto ctx = ihipGetTlsDefaultCtx();
*module = new ihipModule_t;
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
void *p;
uint64_t size = ElfSize(image);
hsa_agent_t agent = currentDevice->_hsaAgent;
hsa_region_t sysRegion;
hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
status = hsa_memory_allocate(sysRegion, size, (void**)&p);
if (!module) return ihipLogStatus(hipErrorInvalidValue);
if(status != HSA_STATUS_SUCCESS){
return ihipLogStatus(hipErrorOutOfMemory);
}
*module = new ihipModule_t;
char *ptr = (char*)p;
if(!ptr){
return ihipLogStatus(hipErrorOutOfMemory);
}
(*module)->ptr = p;
(*module)->size = size;
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) return ihipLogStatus(hipErrorInvalidContext);
memcpy(ptr, image, size);
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&(*module)->executable);
status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
(*module)->executable = hip_impl::load_executable(
read_elf_file_as_string(image), (*module)->executable, this_agent());
if(status != HSA_STATUS_SUCCESS){
return ihipLogStatus(hipErrorSharedObjectInitFailed);
}
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
CHECKLOG_HSA(status, hipErrorNotInitialized);
status = hsa_executable_load_code_object((*module)->executable, agent, (*module)->object, NULL);
CHECKLOG_HSA(status, hipErrorNotInitialized);
status = hsa_executable_freeze((*module)->executable, NULL);
CHECKLOG_HSA(status, hipErrorNotInitialized);
}
return ihipLogStatus(ret);
return ihipLogStatus(
(*module)->executable.handle ? hipSuccess : hipErrorUnknown);
}
hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues)
@@ -838,21 +555,20 @@ hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned
return hipModuleLoadData(module, image);
}
hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name)
hipError_t hipModuleGetTexRef(
textureReference** texRef, hipModule_t hmod, const char* name)
{
HIP_INIT_API(texRef, hmod, name);
hipError_t ret = hipErrorNotFound;
if(texRef == NULL){
ret = hipErrorInvalidValue;
} else {
if(name == NULL || hmod == NULL){
ret = hipErrorNotInitialized;
} else{
const auto it = hmod->coGlobals.find(name);
if (it == hmod->coGlobals.end()) return ihipLogStatus(hipErrorInvalidValue);
*texRef = reinterpret_cast<textureReference*>(it->second);
ret = hipSuccess;
}
}
return ihipLogStatus(ret);
if(!texRef) return ihipLogStatus(hipErrorInvalidValue);
if(!hmod || !name) return ihipLogStatus(hipErrorNotInitialized);
const auto it = globals().find(name);
if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue);
*texRef = static_cast<textureReference*>(it->second.get());
return ihipLogStatus(hipSuccess);
}
+112
Vedi File
@@ -0,0 +1,112 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <cstdint>
#include <string>
namespace hip_impl
{
inline
void* address(hsa_executable_symbol_t x)
{
void* r = nullptr;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
return r;
}
inline
hsa_agent_t agent(hsa_executable_symbol_t x)
{
hsa_agent_t r = {};
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
return r;
}
inline
std::uint32_t group_size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
return r;
}
inline
std::uint64_t kernel_object(hsa_executable_symbol_t x)
{
std::uint64_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
return r;
}
inline
std::string name(hsa_executable_symbol_t x)
{
std::uint32_t sz = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
std::string r(sz, '\0');
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
return r;
}
inline
std::uint32_t private_size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
return r;
}
inline
std::uint32_t size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
return r;
}
inline
hsa_symbol_kind_t type(hsa_executable_symbol_t x)
{
hsa_symbol_kind_t r = {};
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
return r;
}
}
+97
Vedi File
@@ -0,0 +1,97 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <iostream>
#include <sstream>
namespace hip_impl
{
hc::accelerator_view lock_stream_hip_(
hipStream_t& stream, void*& locked_stream)
{ // This allocated but does not take ownership of locked_stream. If it is
// not deleted elsewhere it will leak.
using L = decltype(stream->lockopen_preKernelCommand());
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
locked_stream = new L{stream->lockopen_preKernelCommand()};
return (*static_cast<L*>(locked_stream))->_av;
}
void print_prelaunch_trace_(
const char* kernel_name,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream)
{
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
HIP_PROFILE_API ||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
std::stringstream os;
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
<< " hipLaunchKernel '" << kernel_name << "'"
<< " gridDim:" << num_blocks
<< " groupDim:" << dim_blocks
<< " sharedMem:+" << group_mem_bytes
<< " " << *stream;
if (HIP_PROFILE_API == 0x1) {
std::string shortAtpString("hipLaunchKernel:");
shortAtpString += kernel_name;
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
} else if (HIP_PROFILE_API == 0x2) {
MARKER_BEGIN(os.str().c_str(), "HIP");
}
if (COMPILE_HIP_DB && HIP_TRACE_API) {
std::string fullStr;
recordApiTrace(&fullStr, os.str());
}
}
}
void unlock_stream_hip_(
hipStream_t stream,
void* locked_stream,
const char* kernel_name,
hc::accelerator_view* acc_v)
{ // Precondition: acc_v is the accelerator_view associated with stream
// which is guarded by locked_stream;
// locked_stream is deletable.
using L = decltype(stream->lockopen_preKernelCommand());
stream->lockclose_postKernelCommand(kernel_name, acc_v);
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
+3 -8
Vedi File
@@ -84,7 +84,7 @@ __device__ float erfcf(float x)
}
__device__ float erfcinvf(float y)
{
return __hip_erfinvf(1 - y);
return hc::precise_math::erfcinvf(y);
}
__device__ float erfcxf(float x)
{
@@ -96,7 +96,7 @@ __device__ float erff(float x)
}
__device__ float erfinvf(float y)
{
return __hip_erfinvf(y);
return hc::precise_math::erfinvf(y);
}
__device__ float exp10f(float x)
{
@@ -192,12 +192,7 @@ __device__ float ldexpf(float x, int exp)
}
__device__ float lgammaf(float x)
{
float val = 0.0f;
float y = x - 1;
while(y > 0){
val += logf(y--);
}
return val;
return hc::precise_math::lgammaf(x);
}
__device__ long long int llrintf(float x)
{
+494
Vedi File
@@ -0,0 +1,494 @@
#include "../include/hip/hcc_detail/program_state.hpp"
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "hip_hcc_internal.h"
#include "hsa_helpers.hpp"
#include "trace_helper.h"
#include "elfio/elfio.hpp"
#include <link.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
using namespace ELFIO;
using namespace hip_impl;
using namespace std;
namespace std
{
template<>
struct hash<hsa_isa_t> {
size_t operator()(hsa_isa_t x) const
{
return hash<decltype(x.handle)>{}(x.handle);
}
};
}
inline
constexpr
bool operator==(hsa_isa_t x, hsa_isa_t y)
{
return x.handle == y.handle;
}
namespace
{
struct Symbol {
std::string name;
ELFIO::Elf64_Addr value = 0;
ELFIO::Elf_Xword size = 0;
ELFIO::Elf_Half sect_idx = 0;
std::uint8_t bind = 0;
std::uint8_t type = 0;
std::uint8_t other = 0;
};
inline
Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx)
{
assert(idx < section.get_symbols_num());
Symbol r;
section.get_symbol(
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
return r;
}
template<typename P>
inline
section* find_section_if(elfio& reader, P p)
{
const auto it = find_if(
reader.sections.begin(), reader.sections.end(), move(p));
return it != reader.sections.end() ? *it : nullptr;
}
vector<string> copy_names_of_undefined_symbols(
const symbol_section_accessor& section)
{
vector<string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(section, i);
if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) {
r.push_back(std::move(tmp.name));
}
}
return r;
}
const std::unordered_map<
std::string,
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& symbol_addresses()
{
static unordered_map<string, pair<Elf64_Addr, Elf_Xword>> r;
static once_flag f;
call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
static constexpr const char self[] = "/proc/self/exe";
elfio reader;
static unsigned int iter = 0u;
if (reader.load(!iter ? self : info->dlpi_name)) {
auto it = find_section_if(
reader, [](const class section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (it) {
const symbol_section_accessor symtab{reader, it};
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
auto tmp = read_symbol(symtab, i);
if (tmp.type == STT_OBJECT &&
tmp.sect_idx != SHN_UNDEF) {
const auto addr =
tmp.value + (iter ? info->dlpi_addr : 0);
r.emplace(
move(tmp.name), make_pair(addr, tmp.size));
}
}
}
++iter;
}
return 0;
}, nullptr);
});
return r;
}
void associate_code_object_symbols_with_host_allocation(
const elfio& reader,
section* code_object_dynsym,
hsa_agent_t agent,
hsa_executable_t executable)
{
if (!code_object_dynsym) return;
const auto undefined_symbols = copy_names_of_undefined_symbols(
symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
if (globals().find(x) != globals().cend()) return;
const auto it1 = symbol_addresses().find(x);
if (it1 == symbol_addresses().cend()) {
throw runtime_error{"Global symbol: " + x + " is undefined."};
}
static mutex mtx;
lock_guard<mutex> lck{mtx};
if (globals().find(x) != globals().cend()) return;
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(it1->second.first),
it1->second.second,
nullptr, // All agents.
0,
&p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock});
}
}
vector<uint8_t> code_object_blob_for_process()
{
static constexpr const char self[] = "/proc/self/exe";
static constexpr const char kernel_section[] = ".kernel";
elfio reader;
if (!reader.load(self)) {
throw runtime_error{"Failed to load ELF file for current process."};
}
auto kernels = find_section_if(reader, [](const section* x) {
return x->get_name() == kernel_section;
});
vector<uint8_t> r;
if (kernels) {
r.insert(
r.end(),
kernels->get_data(),
kernels->get_data() + kernels->get_size());
}
return r;
}
const unordered_map<hsa_isa_t, vector<vector<uint8_t>>>& code_object_blobs()
{
static unordered_map<hsa_isa_t, vector<vector<uint8_t>>> r;
static once_flag f;
call_once(f, []() {
static vector<vector<uint8_t>> blobs{
code_object_blob_for_process()};
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_name() == ".kernel";
});
if (it) blobs.emplace_back(
it->get_data(), it->get_data() + it->get_size());
}
return 0;
}, nullptr);
for (auto&& blob : blobs) {
Bundled_code_header tmp{blob};
if (valid(tmp)) {
for (auto&& bundle : bundles(tmp)) {
r[triple_to_hsa_isa(bundle.triple)].push_back(
bundle.blob);
}
}
}
});
return r;
}
vector<pair<uintptr_t, string>> function_names_for(
const elfio& reader, section* symtab)
{
vector<pair<uintptr_t, string>> r;
symbol_section_accessor symbols{reader, symtab};
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(symbols, i);
if (tmp.type == STT_FUNC &&
tmp.sect_idx != SHN_UNDEF &&
!tmp.name.empty()) {
r.emplace_back(tmp.value, tmp.name);
}
}
return r;
}
const vector<pair<uintptr_t, string>>& function_names_for_process()
{
static constexpr const char self[] = "/proc/self/exe";
static vector<pair<uintptr_t, string>> r;
static once_flag f;
call_once(f, []() {
elfio reader;
if (!reader.load(self)) {
throw runtime_error{
"Failed to load the ELF file for the current process."};
}
auto symtab = find_section_if(reader, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (symtab) r = function_names_for(reader, symtab);
});
return r;
}
const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels()
{
static unordered_map<string, vector<hsa_executable_symbol_t>> r;
static once_flag f;
call_once(f, []() {
static const auto copy_kernels = [](
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) {
if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
return HSA_STATUS_SUCCESS;
};
for (auto&& agent_executables : executables()) {
for (auto&& executable : agent_executables.second) {
hsa_executable_iterate_agent_symbols(
executable,
agent_executables.first,
copy_kernels,
nullptr);
}
}
});
return r;
}
void load_code_object_and_freeze_executable(
const string& file, hsa_agent_t agent, hsa_executable_t executable)
{ // TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
if (p) {
hsa_code_object_reader_destroy(*p);
delete p;
}
};
using RAII_code_reader = unique_ptr<
hsa_code_object_reader_t, decltype(cor_deleter)>;
if (!file.empty()) {
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_memory(
file.data(), file.size(), tmp.get());
hsa_executable_load_agent_code_object(
executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static vector<RAII_code_reader> code_readers;
static mutex mtx;
lock_guard<mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
}
}
namespace hip_impl
{
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>& executables()
{ // TODO: This leaks the hsa_executable_ts, it should use RAII.
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
static once_flag f;
call_once(f, []() {
static const auto accelerators = hc::accelerator::get_all();
for (auto&& acc : accelerators) {
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
if (!agent || !acc.is_hsa_accelerator()) continue;
hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) {
const auto it = code_object_blobs().find(x);
if (it != code_object_blobs().cend()) {
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
for (auto&& blob : it->second) {
hsa_executable_t tmp = {};
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&tmp);
// TODO: this is massively inefficient and only
// meant for illustration.
string blob_to_str{blob.cbegin(), blob.cend()};
tmp = load_executable(blob_to_str, tmp, a);
if (tmp.handle) r[a].push_back(tmp);
}
}
return HSA_STATUS_SUCCESS;
}, agent);
}
});
return r;
}
const unordered_map<uintptr_t, string>& function_names()
{
static unordered_map<uintptr_t, string> r{
function_names_for_process().cbegin(),
function_names_for_process().cend()};
static once_flag f;
call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (it) {
auto n = function_names_for(tmp, it);
for (auto&& f : n) f.first += info->dlpi_addr;
r.insert(
make_move_iterator(n.begin()),
make_move_iterator(n.end()));
}
}
return 0;
}, nullptr);
});
return r;
}
const unordered_map<
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions()
{
static unordered_map<
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
static once_flag f;
call_once(f, []() {
for (auto&& function : function_names()) {
const auto it = kernels().find(function.second);
if (it != kernels().cend()) {
for (auto&& kernel_symbol : it->second) {
r[function.first].emplace_back(
agent(kernel_symbol),
Kernel_descriptor{
kernel_object(kernel_symbol),
group_size(kernel_symbol),
private_size(kernel_symbol),
it->first});
}
}
}
});
return r;
}
unordered_map<string, RAII_global>& globals()
{
static unordered_map<string, RAII_global> r;
static once_flag f;
call_once(f, []() { r.reserve(symbol_addresses().size()); });
return r;
}
hsa_executable_t load_executable(
const string& file, hsa_executable_t executable, hsa_agent_t agent)
{
elfio reader;
stringstream tmp{file};
if (!reader.load(tmp)) return hsa_executable_t{};
const auto code_object_dynsym =
find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_DYNSYM;
});
associate_code_object_symbols_with_host_allocation(
reader, code_object_dynsym, agent, executable);
load_code_object_and_freeze_executable(file, agent, executable);
return executable;
}
} // Namespace hip_impl.
+1 -1
Vedi File
@@ -23,7 +23,7 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = threadIdx.x;
memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
memset(ptr + tx, val, sizeof(uint32_t));
}
int main()
@@ -18,8 +18,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* XXBUILD: %t %s ../test_common.cpp
* XXRUN: %t
* HIT_END
*/
+30 -15
Vedi File
@@ -159,11 +159,16 @@ bool dataTypesRun(){
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -221,11 +226,16 @@ bool dataTypesRun2(){
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -281,11 +291,16 @@ bool dataTypesRun4(){
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -69,7 +69,16 @@ int main(int argc, char *argv[])
// Record the start event
HIPCHECK (hipEventRecord(start, NULL));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const float*>(A_d),
static_cast<const float*>(B_d),
C_d,
N);
HIPCHECK (hipEventRecord(stop, NULL));
+10 -1
Vedi File
@@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
HIPCHECK(hipEventRecord(timingDisabled, stream));
// sandwhich a kernel:
HIPCHECK(hipEventRecord(start, stream));
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
stream,
static_cast<const int*>(C_d),
C_h,
numElements,
count);
HIPCHECK(hipEventRecord(stop, stream));
+10 -1
Vedi File
@@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements,
HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
}
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const T*>(dmem->A_d()),
static_cast<const T*>(dmem->B_d()),
dmem->C_d(),
numElements);
if (useDeviceToDevice) {
// Do an extra device-to-device copy here to mix things up:
@@ -49,21 +49,39 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
@@ -73,8 +91,8 @@ int main()
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
@@ -50,25 +50,43 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s));
HIPCHECK(hipStreamSynchronize(s));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
@@ -78,7 +96,7 @@ int main()
}
passed();
}
@@ -48,24 +48,42 @@ int main()
HIPCHECK(hipMalloc(&X_d,Nbytes));
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
@@ -74,7 +92,7 @@ int main()
passed();
}
@@ -51,26 +51,44 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HIPCHECK (hipStreamSynchronize(s));
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
@@ -79,7 +97,7 @@ int main()
}
passed();
}
@@ -63,7 +63,16 @@ void simpleTest1()
HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
// This is the null stream?
//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
hipLaunchKernel(
HipTest::vectorADDReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const T*>(A_d),
static_cast<const T*>(B_d),
C_d,
numElements);
MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
@@ -128,7 +128,17 @@ void Streamer<T>::enqueAsync()
{
printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
static_cast<const T*>(_B_d),
_C_d,
_numElements,
p_repeat);
}
@@ -225,7 +235,17 @@ int main(int argc, char *argv[])
auto lastStreamer = streamers[s - 1];
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
0/*nullstream*/,
static_cast<const int*>(lastStreamer->_C_d),
static_cast<const int*>(lastStreamer->_C_d),
nullStreamer->_C_d,
numElements,
1/*repeat*/);
if (p_db) {
@@ -257,7 +277,17 @@ int main(int argc, char *argv[])
auto lastStreamer = streamers[s - 1];
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
0/*nullstream*/,
static_cast<const int*>(lastStreamer->_C_d),
static_cast<const int*>(lastStreamer->_C_d),
nullStreamer->_C_d,
numElements,
1/*repeat*/);
nullStreamer->D2H();
@@ -97,7 +97,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
// Launch kernel into null stream, should result in C_h == count.
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
0 /*stream*/,
static_cast<const int*>(C_d),
C_h,
numElements,
count);
HIPCHECK(hipEventRecord(stop, 0/*default*/));
switch (syncMode) {
@@ -18,8 +18,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* RUN: %t
* ZZZBUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* ZZZRUN: %t
* HIT_END
*/
@@ -163,9 +163,27 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
if (_commandType == COMMAND_ADD_REVERSE) {
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
_C_d,
static_cast<int64_t>(_numElements),
static_cast<int>(p_count));
} else if (_commandType == COMMAND_ADD_FORWARD) {
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(
HipTest::addCount,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
_C_d,
_numElements,
static_cast<int>(p_count));
} else if (_commandType == COMMAND_COPY) {
HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
} else {