Merge pull request #246 from AlexVlx/feature_use_module_based_dispatch_instead_of_pfe

Use natural dispatch syntax
Этот коммит содержится в:
Ben Sander
2017-12-01 14:21:25 -06:00
коммит произвёл GitHub
родитель 2f542c46c8 33bb425013
Коммит 1a6727e2fc
32 изменённых файлов: 2793 добавлений и 1466 удалений
+3 -1
Просмотреть файл
@@ -167,6 +167,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}")
set(SOURCE_FILES_RUNTIME
src/code_object_bundle.cpp
src/hip_hcc.cpp
src/hip_context.cpp
src/hip_device.cpp
@@ -179,7 +180,8 @@ if(HIP_PLATFORM STREQUAL "hcc")
src/hip_db.cpp
src/grid_launch.cpp
src/hip_texture.cpp
src/env.cpp)
src/env.cpp
src/program_state.cpp)
set(SOURCE_FILES_DEVICE
src/device_util.cpp
+158
Просмотреть файл
@@ -0,0 +1,158 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <algorithm>
#include <cstdint>
#include <istream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>
namespace hip_impl
{
hsa_isa_t triple_to_hsa_isa(const std::string& triple);
struct Bundled_code {
union {
struct {
std::uint64_t offset;
std::uint64_t bundle_sz;
std::uint64_t triple_sz;
};
std::uint8_t cbuf[
sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
};
std::string triple;
std::vector<std::uint8_t> blob;
};
class Bundled_code_header {
// DATA - STATICS
static constexpr const char magic_string_[] =
"__CLANG_OFFLOAD_BUNDLE__";
static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
// DATA
union {
struct {
std::uint8_t bundler_magic_string_[magic_string_sz_];
std::uint64_t bundle_cnt_;
};
std::uint8_t cbuf_[
sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)];
};
std::vector<Bundled_code> bundles_;
// FRIENDS - MANIPULATORS
template<typename RandomAccessIterator>
friend
inline
bool read(
RandomAccessIterator f,
RandomAccessIterator l,
Bundled_code_header& x)
{
if (f == l) return false;
std::copy_n(f, sizeof(x.cbuf_), x.cbuf_);
if (valid(x)) {
x.bundles_.resize(x.bundle_cnt_);
auto it = f + sizeof(x.cbuf_);
for (auto&& y : x.bundles_) {
std::copy_n(it, sizeof(y.cbuf), y.cbuf);
it += sizeof(y.cbuf);
y.triple.insert(y.triple.cend(), it, it + y.triple_sz);
std::copy_n(
f + y.offset, y.bundle_sz, std::back_inserter(y.blob));
it += y.triple_sz;
}
return true;
}
return false;
}
friend
inline
bool read(const std::vector<std::uint8_t>& blob, Bundled_code_header& x)
{
return read(blob.cbegin(), blob.cend(), x);
}
friend
inline
bool read(std::istream& is, Bundled_code_header& x)
{
return read(std::vector<std::uint8_t>{
std::istreambuf_iterator<char>{is},
std::istreambuf_iterator<char>{}},
x);
}
// FRIENDS - ACCESSORS
friend
inline
bool valid(const Bundled_code_header& x)
{
return std::equal(
x.bundler_magic_string_,
x.bundler_magic_string_ + magic_string_sz_,
x.magic_string_);
}
friend
inline
const std::vector<Bundled_code>& bundles(const Bundled_code_header& x)
{
return x.bundles_;
}
public:
// CREATORS
Bundled_code_header() = default;
template<typename RandomAccessIterator>
Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
explicit
Bundled_code_header(const std::vector<std::uint8_t>& blob);
Bundled_code_header(const Bundled_code_header&) = default;
Bundled_code_header(Bundled_code_header&&) = default;
~Bundled_code_header() = default;
// MANIPULATORS
Bundled_code_header& operator=(const Bundled_code_header&) = default;
Bundled_code_header& operator=(Bundled_code_header&&) = default;
};
// CREATORS
template<typename I>
Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
{
read(f, l, *this);
}
} // Namespace hip_impl.
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
+159
Просмотреть файл
@@ -0,0 +1,159 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "code_object_bundle.hpp"
#include "concepts.hpp"
#include "helpers.hpp"
#include "program_state.hpp"
#include "hc.hpp"
#include "hip/hip_hcc.h"
#include "hip_runtime.h"
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
namespace hip_impl
{
template<
typename T,
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline
T round_up_to_next_multiple_nonnegative(T x, T y)
{
T tmp = x + y - 1;
return tmp - tmp % y;
}
inline
std::vector<std::uint8_t> make_kernarg()
{
return {};
}
inline
std::vector<std::uint8_t> make_kernarg(std::vector<std::uint8_t> kernarg)
{
return kernarg;
}
template<typename T>
inline
std::vector<std::uint8_t> make_kernarg(std::vector<uint8_t> kernarg, T x)
{
kernarg.resize(
round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) +
sizeof(T));
new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)};
return kernarg;
}
template<typename T, typename... Ts>
inline
std::vector<std::uint8_t> make_kernarg(
std::vector<std::uint8_t> kernarg, T x, Ts... xs)
{
return make_kernarg(
make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...);
}
template<typename... Ts>
inline
std::vector<std::uint8_t> make_kernarg(Ts... xs)
{
std::vector<std::uint8_t> kernarg;
kernarg.reserve(sizeof(std::tuple<Ts...>));
return make_kernarg(std::move(kernarg), std::move(xs)...);
}
void hipLaunchKernelGGLImpl(
std::uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg);
} // Namespace hip_impl.
template<typename... Args, typename F = void (*)(Args...)>
inline
void hipLaunchKernelGGL(
F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t sharedMemBytes,
hipStream_t stream,
Args... args)
{
auto kernarg = hip_impl::make_kernarg(std::move(args)...);
std::size_t kernarg_size = kernarg.size();
void* config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size,
HIP_LAUNCH_PARAM_END
};
hip_impl::hipLaunchKernelGGLImpl(
reinterpret_cast<std::uintptr_t>(kernel),
numBlocks,
dimBlocks,
sharedMemBytes,
stream,
&config[0]);
}
template<typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
inline
void hipLaunchKernel(
F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t groupMemBytes,
hipStream_t stream,
Args... args)
{
hipLaunchKernelGGL(
kernel,
numBlocks,
dimBlocks,
groupMemBytes,
stream,
hipLaunchParm{},
std::move(args)...);
}
+6 -983
Просмотреть файл
@@ -19,989 +19,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if GENERIC_GRID_LAUNCH == 1
#include "concepts.hpp"
#include "helpers.hpp"
#include "hc.hpp"
#include "hip/hip_hcc.h"
#include "hip_runtime.h"
#include <functional>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <utility>
namespace hip_impl
{
namespace
{
struct New_grid_launch_tag {};
struct Old_grid_launch_tag {};
template<typename C, typename D>
class RAII_guard {
D dtor_;
public:
RAII_guard() = default;
RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)}
{
ctor();
}
RAII_guard(const RAII_guard&) = default;
RAII_guard(RAII_guard&&) = default;
RAII_guard& operator=(const RAII_guard&) = default;
RAII_guard& operator=(RAII_guard&&) = default;
~RAII_guard() { dtor_(); }
};
template<typename C, typename D>
RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor)
{
return RAII_guard<C, D>{ctor, std::move(dtor)};
}
template<FunctionalProcedure F, typename... Ts>
using is_new_grid_launch_t = typename std::conditional<
is_callable<F(Ts...)>{},
New_grid_launch_tag,
Old_grid_launch_tag>::type;
}
// TODO: - dispatch rank should be derived from the domain dimensions passed
// in, and not always assumed to be 3;
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
void grid_launch_hip_impl_(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
const hc::accelerator_view& acc_v,
K k)
{
const auto d = hc::extent<3>{
num_blocks.z * dim_blocks.z,
num_blocks.y * dim_blocks.y,
num_blocks.x * dim_blocks.x}.tile_with_dynamic(
dim_blocks.z,
dim_blocks.y,
dim_blocks.x,
group_mem_bytes);
try {
hc::parallel_for_each(acc_v, d, k);
}
catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: "
<< ex.what() << std::endl;
throw;
}
}
// TODO: these are workarounds, they should be removed.
hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&);
void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
void unlock_stream_hip_(
hipStream_t, void*, const char*, hc::accelerator_view*);
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
void grid_launch_hip_impl_(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
void* lck_stream = nullptr;
auto acc_v = lock_stream_hip_(stream, lck_stream);
auto stream_guard = make_RAII_guard(
std::bind(
print_prelaunch_trace_,
kernel_name,
num_blocks,
dim_blocks,
group_mem_bytes,
stream),
std::bind(
unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v));
try {
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
acc_v,
std::move(k));
}
catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: "
<< ex.what() << std::endl;
throw;
}
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...})
inline
void grid_launch_hip_impl_(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k)
{
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...})
inline
void grid_launch_hip_impl_(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
kernel_name,
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
grid_launch_hip_impl_(
is_new_grid_launch_t<K, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
kernel_name,
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k)
{
grid_launch_hip_impl_(
is_new_grid_launch_t<K, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k));
}
// TODO: these are temporary and purposefully noisy and disruptive.
#define make_kernel_name_hip(k, n)\
HIP_kernel_functor_name_begin ## _ ## k ## _ ## \
HIP_kernel_functor_name_end ## _ ## n
#define make_kernel_functor_hip_30(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25, p26, p27)\
struct make_kernel_name_hip(function_name, 28) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
std::decay_t<decltype(p26)> _p26_;\
std::decay_t<decltype(p27)> _p27_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
_p26_, _p27_);\
}\
}
#define make_kernel_functor_hip_29(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25, p26)\
struct make_kernel_name_hip(function_name, 27) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
std::decay_t<decltype(p26)> _p26_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
_p26_);\
}\
}
#define make_kernel_functor_hip_28(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25)\
struct make_kernel_name_hip(function_name, 26) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\
}\
}
#define make_kernel_functor_hip_27(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24)\
struct make_kernel_name_hip(function_name, 25) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\
}\
}
#define make_kernel_functor_hip_26(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\
struct make_kernel_name_hip(function_name, 24) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\
}\
}
#define make_kernel_functor_hip_25(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\
struct make_kernel_name_hip(function_name, 23) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
__attribute__((used, flatten))\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_);\
}\
}
#define make_kernel_functor_hip_24(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\
struct make_kernel_name_hip(function_name, 22) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_);\
}\
}
#define make_kernel_functor_hip_23(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\
struct make_kernel_name_hip(function_name, 21) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_);\
}\
}
#define make_kernel_functor_hip_22(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\
struct make_kernel_name_hip(function_name, 20) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_);\
}\
}
#define make_kernel_functor_hip_21(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18)\
struct make_kernel_name_hip(function_name, 19) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_);\
}\
}
#define make_kernel_functor_hip_20(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17)\
struct make_kernel_name_hip(function_name, 18) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\
}\
}
#define make_kernel_functor_hip_19(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16)\
struct make_kernel_name_hip(function_name, 17) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\
}\
}
#define make_kernel_functor_hip_18(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15)\
struct make_kernel_name_hip(function_name, 16) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\
}\
}
#define make_kernel_functor_hip_17(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14)\
struct make_kernel_name_hip(function_name, 15) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_);\
}\
}
#define make_kernel_functor_hip_16(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13)\
struct make_kernel_name_hip(function_name, 14) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_);\
}\
}
#define make_kernel_functor_hip_15(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12)\
struct make_kernel_name_hip(function_name, 13) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_);\
}\
}
#define make_kernel_functor_hip_14(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11)\
struct make_kernel_name_hip(function_name, 12) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_);\
}\
}
#define make_kernel_functor_hip_13(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\
struct make_kernel_name_hip(function_name, 11) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_);\
}\
}
#define make_kernel_functor_hip_12(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\
struct make_kernel_name_hip(function_name, 10) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\
_p9_);\
}\
}
#define make_kernel_functor_hip_11(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\
struct make_kernel_name_hip(function_name, 9) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\
}\
}
#define make_kernel_functor_hip_10(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\
struct make_kernel_name_hip(function_name, 8) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\
}\
}
#define make_kernel_functor_hip_9(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\
struct make_kernel_name_hip(function_name, 7) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\
}\
}
#define make_kernel_functor_hip_8(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5)\
struct make_kernel_name_hip(function_name, 6) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\
}\
}
#define make_kernel_functor_hip_7(\
function_name, kernel_name, p0, p1, p2, p3, p4)\
struct make_kernel_name_hip(function_name, 5) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\
}\
}
#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\
struct make_kernel_name_hip(function_name, 4) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_);\
}\
}
#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\
struct make_kernel_name_hip(function_name, 3) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_);\
}\
}
#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\
struct make_kernel_name_hip(function_name, 2) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_);\
}\
}
#define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n
#define make_kernel_functor_hip_3(function_name, kernel_name, p0)\
struct make_kernel_name_hip(function_name, 1) {\
std::decay_t<decltype(p0)> _p0_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_);\
}\
}
#define make_kernel_functor_hip_2(function_name, kernel_name)\
struct make_kernel_name_hip(function_name, 0) {\
void operator()(const hc::tiled_index<3>&) [[hc]]\
{\
return kernel_name(hipLaunchParm{});\
}\
}
#define make_kernel_functor_hip_1(...)
#define make_kernel_functor_hip_0(...)
#define make_kernel_functor_hip_(...)\
overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__)
#define hipLaunchNamedKernelGGL(\
function_name,\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
...)\
do {\
make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\
hip_kernel_functor_impl_{__VA_ARGS__};\
hip_impl::grid_launch_hip_(\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
#kernel_name,\
hip_kernel_functor_impl_);\
} while(0)
#define hipLaunchKernelGGL(\
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
do {\
hipLaunchNamedKernelGGL(\
unnamed,\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
##__VA_ARGS__);\
} while (0)
#define hipLaunchKernel(\
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
do {\
hipLaunchKernelGGL(\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
hipLaunchParm{},\
##__VA_ARGS__);\
} while(0)
}
#endif //GENERIC_GRID_LAUNCH
#if __hcc_workweek__ >= 17481
#include "functional_grid_launch.hpp"
#else
#include "macro_based_grid_launch.hpp"
#endif
#endif //GENERIC_GRID_LAUNCH
+6 -1
Просмотреть файл
@@ -44,7 +44,12 @@ THE SOFTWARE.
#if GENERIC_GRID_LAUNCH == 0
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
#else
#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak))
#if __hcc_workweek__ >= 17481
#define __global__ \
__attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
#else
#define __global__ __attribute__((hc, used))
#endif
#endif //GENERIC_GRID_LAUNCH
#define __noinline__ __attribute__((noinline))
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+84
Просмотреть файл
@@ -0,0 +1,84 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cstddef>
#include <istream>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
struct ihipModuleSymbol_t;
using hipFunction_t = ihipModuleSymbol_t*;
namespace std
{
template<>
struct hash<hsa_agent_t> {
size_t operator()(hsa_agent_t x) const
{
return hash<decltype(x.handle)>{}(x.handle);
}
};
}
inline
constexpr
bool operator==(hsa_agent_t x, hsa_agent_t y)
{
return x.handle == y.handle;
}
namespace hip_impl
{
struct Kernel_descriptor {
std::uint64_t kernel_object_;
std::uint32_t group_size_;
std::uint32_t private_size_;
std::string name_;
operator hipFunction_t() const
{ // TODO: this is awful and only meant for illustration.
return reinterpret_cast<hipFunction_t>(
const_cast<Kernel_descriptor*>(this));
}
};
using RAII_global = std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
const std::unordered_map<
hsa_agent_t, std::vector<hsa_executable_t>>& executables();
const std::unordered_map<
std::uintptr_t,
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
const std::unordered_map<std::uintptr_t, std::string>& function_names();
std::unordered_map<std::string, RAII_global>& globals();
hsa_executable_t load_executable(
hsa_executable_t executable, hsa_agent_t agent, std::istream& file);
} // Namespace hip_impl.
+39
Просмотреть файл
@@ -0,0 +1,39 @@
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include <hsa/hsa.h>
#include <cstdint>
#include <string>
#include <vector>
hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple)
{
static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx";
static constexpr std::size_t prefix_sz = sizeof(prefix) - 1;
hsa_isa_t r = {};
auto idx = triple.find(prefix);
if (idx != std::string::npos) {
idx += prefix_sz;
std::string tmp = "AMD:AMDGPU";
while (idx != triple.size()) {
tmp.push_back(':');
tmp.push_back(triple[idx++]);
}
hsa_isa_from_name(tmp.c_str(), &r);
}
return r;
}
// DATA - STATICS
constexpr const char hip_impl::Bundled_code_header::magic_string_[];
// CREATORS
hip_impl::Bundled_code_header::Bundled_code_header(
const std::vector<std::uint8_t>& x)
: Bundled_code_header{x.cbegin(), x.cend()}
{}
+57 -41
Просмотреть файл
@@ -28,27 +28,6 @@ extern "C" float __ocml_rint_f32(float);
extern "C" float __ocml_ceil_f32(float);
extern "C" float __ocml_trunc_f32(float);
struct holder64Bit{
union{
double d;
unsigned long int uli;
signed long int sli;
signed int si[2];
unsigned int ui[2];
};
} __attribute__((aligned(8)));
struct holder32Bit {
union {
float f;
unsigned int ui;
signed int si;
};
} __attribute__((aligned(4)));
__device__ struct holder64Bit hold64;
__device__ struct holder32Bit hold32;
__device__ float __double2float_rd(double x)
{
return (double)x;
@@ -69,13 +48,21 @@ __device__ float __double2float_rz(double x)
__device__ int __double2hiint(double x)
{
hold64.d = x;
return hold64.si[1];
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[1];
}
__device__ int __double2loint(double x)
{
hold64.d = x;
return hold64.si[0];
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[0];
}
@@ -150,8 +137,12 @@ __device__ unsigned long long int __double2ull_rz(double x)
__device__ long long int __double_as_longlong(double x)
{
hold64.d = x;
return hold64.sli;
static_assert(sizeof(long long) == sizeof(double), "");
long long tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ int __float2int_rd(float x)
@@ -224,19 +215,32 @@ __device__ unsigned long long int __float2ull_rz(float x)
__device__ int __float_as_int(float x)
{
hold32.f = x;
return hold32.si;
static_assert(sizeof(int) == sizeof(float), "");
int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ unsigned int __float_as_uint(float x)
{
hold32.f = x;
return hold32.ui;
static_assert(sizeof(unsigned int) == sizeof(float), "");
unsigned int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __hiloint2double(int hi, int lo)
__device__ double __hiloint2double(int32_t hi, int32_t lo)
{
hold64.si[1] = hi;
hold64.si[0] = lo;
return hold64.d;
static_assert(sizeof(double) == sizeof(uint64_t), "");
uint64_t tmp0 =
(static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
double tmp1;
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__ double __int2double_rn(int x)
{
@@ -262,8 +266,12 @@ __device__ float __int2float_rz(int x)
__device__ float __int_as_float(int x)
{
hold32.si = x;
return hold32.f;
static_assert(sizeof(float) == sizeof(int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ll2double_rd(long long int x)
@@ -302,8 +310,12 @@ __device__ float __ll2float_rz(long long int x)
__device__ double __longlong_as_double(long long int x)
{
hold64.sli = x;
return hold64.d;
static_assert(sizeof(double) == sizeof(long long), "");
double tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return x;
}
__device__ double __uint2double_rn(int x)
@@ -330,8 +342,12 @@ __device__ float __uint2float_rz(unsigned int x)
__device__ float __uint_as_float(unsigned int x)
{
hold32.ui = x;
return hold32.f;
static_assert(sizeof(float) == sizeof(unsigned int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ull2double_rd(unsigned long long int x)
+37 -12
Просмотреть файл
@@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr)
// loop unrolling
__device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size)
{
uint8_t *dstPtr, *srcPtr;
dstPtr = (uint8_t*)dst;
srcPtr = (uint8_t*)src;
for(uint32_t i=0;i<size;i++) {
dstPtr[i] = srcPtr[i];
auto dstPtr = static_cast<uint8_t*>(dst);
auto srcPtr = static_cast<const uint8_t*>(src);
while (size >= 4u) {
dstPtr[0] = srcPtr[0];
dstPtr[1] = srcPtr[1];
dstPtr[2] = srcPtr[2];
dstPtr[3] = srcPtr[3];
size -= 4u;
srcPtr += 4u;
dstPtr += 4u;
}
return nullptr;
switch (size) {
case 3: dstPtr[2] = srcPtr[2];
case 2: dstPtr[1] = srcPtr[1];
case 1: dstPtr[0] = srcPtr[0];
}
return dst;
}
__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size)
__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size)
{
uint8_t *dstPtr;
dstPtr = (uint8_t*)ptr;
for(uint32_t i=0;i<size;i++) {
dstPtr[i] = val;
auto dstPtr = static_cast<uint8_t*>(dst);
while (size >= 4u) {
dstPtr[0] = val;
dstPtr[1] = val;
dstPtr[2] = val;
dstPtr[3] = val;
size -= 4u;
dstPtr += 4u;
}
return nullptr;
switch (size) {
case 3: dstPtr[2] = val;
case 2: dstPtr[1] = val;
case 1: dstPtr[0] = val;
}
return dst;
}
__device__ float __hip_erfinvf(float x){
+137
Просмотреть файл
@@ -0,0 +1,137 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hcc_detail/program_state.hpp"
#include "hip/hip_runtime_api.h"
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <iostream>
using namespace hc;
using namespace std;
namespace hip_impl
{
namespace
{
inline
string name(uintptr_t function_address)
{
const auto it = function_names().find(function_address);
if (it == function_names().cend()) {
throw runtime_error{
"Invalid function passed to hipLaunchKernelGGL."};
}
return it->second;
}
inline
string name(hsa_agent_t agent)
{
char n[64] = {};
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
return string{n};
}
inline
hsa_agent_t target_agent(hipStream_t stream)
{
if (stream) {
return *static_cast<hsa_agent_t*>(
stream->locked_getAv()->get_hsa_agent());
}
else if (
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
return ihipGetDevice(
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
}
else {
return *static_cast<hsa_agent_t*>(
accelerator{}.get_default_view().get_hsa_agent());
}
}
}
void hipLaunchKernelGGLImpl(
uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg)
{
const auto it0 = functions().find(function_address);
if (it0 == functions().cend()) {
throw runtime_error{
"No device code available for function: " +
name(function_address)
};
}
auto agent = target_agent(stream);
const auto it1 = find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const pair<hsa_agent_t, Kernel_descriptor>& x) {
return x.first.handle == agent.handle;
});
if (it1 == it0->second.cend()) {
throw runtime_error{
"No code available for function: " + name(function_address) +
", for agent: " + name(agent)
};
}
for (auto&& agent_kernel : it0->second) {
if (agent.handle == agent_kernel.first.handle) {
hipModuleLaunchKernel(
agent_kernel.second,
numBlocks.x,
numBlocks.y,
numBlocks.z,
dimBlocks.x,
dimBlocks.y,
dimBlocks.z,
sharedMemBytes,
stream,
nullptr,
kernarg);
}
}
}
}
+5 -75
Просмотреть файл
@@ -22,78 +22,8 @@ THE SOFTWARE.
#include "hip/hcc_detail/grid_launch_GGL.hpp"
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <iostream>
#include <sstream>
namespace hip_impl
{
hc::accelerator_view lock_stream_hip_(
hipStream_t& stream, void*& locked_stream)
{ // This allocated but does not take ownership of locked_stream. If it is
// not deleted elsewhere it will leak.
using L = decltype(stream->lockopen_preKernelCommand());
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
locked_stream = new L{stream->lockopen_preKernelCommand()};
return (*static_cast<L*>(locked_stream))->_av;
}
void print_prelaunch_trace_(
const char* kernel_name,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream)
{
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
HIP_PROFILE_API ||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
std::stringstream os;
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
<< " hipLaunchKernel '" << kernel_name << "'"
<< " gridDim:" << num_blocks
<< " groupDim:" << dim_blocks
<< " sharedMem:+" << group_mem_bytes
<< " " << *stream;
if (HIP_PROFILE_API == 0x1) {
std::string shortAtpString("hipLaunchKernel:");
shortAtpString += kernel_name;
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
} else if (HIP_PROFILE_API == 0x2) {
MARKER_BEGIN(os.str().c_str(), "HIP");
}
if (COMPILE_HIP_DB && HIP_TRACE_API) {
std::string fullStr;
recordApiTrace(&fullStr, os.str());
}
}
}
void unlock_stream_hip_(
hipStream_t stream,
void* locked_stream,
const char* kernel_name,
hc::accelerator_view* acc_v)
{ // Precondition: acc_v is the accelerator_view associated with stream
// which is guarded by locked_stream;
// locked_stream is deletable.
using L = decltype(stream->lockopen_preKernelCommand());
stream->lockclose_postKernelCommand(kernel_name, acc_v);
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
#if __hcc_workweek__ >= 17481
#include "functional_grid_launch.inl"
#else
#include "macro_based_grid_launch.inl"
#endif
+1 -1
Просмотреть файл
@@ -248,7 +248,7 @@ static const DbName dbName [] =
#if COMPILE_HIP_DB
#define tprintf(trace_level, ...) {\
if (HIP_DB & (1<<(trace_level))) {\
char msgStr[1000];\
char msgStr[2000];\
snprintf(msgStr, 2000, __VA_ARGS__);\
fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \
}\
+92 -78
Просмотреть файл
@@ -808,6 +808,26 @@ hipError_t hipHostUnregister(void *hostPtr)
return ihipLogStatus(hip_status);
}
namespace
{
inline
hipDeviceptr_t agent_address_for_symbol(const char* symbolName)
{
hipDeviceptr_t r = nullptr;
#if __hcc_workweek__ >= 17481
size_t byte_cnt = 0u;
hipModuleGetGlobal(&r, &byte_cnt, 0, symbolName);
#else
auto ctx = ihipGetTlsDefaultCtx();
auto acc = ctx->getDevice()->_acc;
r = acc.get_symbol_address(symbolName);
#endif
return r;
}
}
hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
{
HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind);
@@ -821,7 +841,8 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou
hc::accelerator acc = ctx->getDevice()->_acc;
void *dst = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t dst =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -856,7 +877,8 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
hc::accelerator acc = ctx->getDevice()->_acc;
void *src = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t src =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -893,7 +915,8 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
hc::accelerator acc = ctx->getDevice()->_acc;
void *dst = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t dst =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -931,7 +954,8 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
hc::accelerator acc = ctx->getDevice()->_acc;
void *src = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t src =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src);
if(src == nullptr || dst == nullptr)
@@ -1285,42 +1309,59 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
return ihipLogStatus(e);
}
// TODO - make member function of stream?
namespace
{
template<
uint32_t block_dim,
typename RandomAccessIterator,
typename N,
typename T>
__global__
void hip_fill_n(RandomAccessIterator f, N n, T value)
{
const uint32_t grid_dim = gridDim.x * blockDim.x;
size_t idx = blockIdx.x * block_dim + threadIdx.x;
while (idx < n) {
__builtin_memcpy(
reinterpret_cast<void*>(&f[idx]),
reinterpret_cast<const void*>(&value),
sizeof(T));
idx += grid_dim;
}
}
template<
typename T,
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline
const T& clamp_integer(const T& x, const T& lower, const T& upper)
{
assert(!(upper < lower));
return std::min(upper, std::max(x, lower));
}
}
template <typename T>
void
ihipMemsetKernel(hipStream_t stream,
LockedAccessor_StreamCrit_t &crit,
T * ptr, T val, size_t sizeBytes,
hc::completion_future *cf)
T * ptr, T val, size_t sizeBytes)
{
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
const int threads_per_wg = 256;
static constexpr uint32_t block_dim = 256;
int threads = wg * threads_per_wg;
if (threads > sizeBytes) {
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
}
hc::extent<1> ext(threads);
auto ext_tile = ext.tile(threads_per_wg);
*cf =
hc::parallel_for_each(
crit->_av,
ext_tile,
[=] (hc::tiled_index<1> idx)
__attribute__((hc))
{
int offset = amp_get_global_id(0);
// TODO-HCC - change to hc_get_local_size()
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
for (int i=offset; i<sizeBytes; i+=stride) {
ptr[i] = val;
}
});
const uint32_t grid_dim = clamp_integer<size_t>(
sizeBytes / block_dim, 1, UINT32_MAX);
hipLaunchKernelGGL(
hip_fill_n<block_dim>,
dim3(grid_dim),
dim3{block_dim},
0u,
stream,
ptr,
sizeBytes,
std::move(val));
}
@@ -1334,17 +1375,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1352,19 +1388,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
if (HIP_API_BLOCKING) {
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
cf.wait();
stream->locked_wait();
}
} else {
e = hipErrorInvalidValue;
@@ -1385,16 +1418,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1402,21 +1431,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1437,17 +1463,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
size_t sizeBytes = pitch * height;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1455,20 +1477,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1489,36 +1509,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
std::cout << ex.what() << std::endl;
e = hipErrorInvalidValue;
}
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
cf.wait();
stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1717,7 +1731,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
HIP_INIT_API ( handle, devPtr);
hipError_t hipStatus = hipSuccess;
// Get the size of allocated pointer
size_t psize;
size_t psize = 0u;
hc::accelerator acc;
if((handle == NULL) || (devPtr == NULL)) {
hipStatus = hipErrorInvalidResourceHandle;
+95 -217
Просмотреть файл
@@ -37,6 +37,7 @@ THE SOFTWARE.
#include "elfio/elfio.hpp"
#include "hip/hip_runtime.h"
#include "hip/hcc_detail/program_state.hpp"
#include "hip_hcc_internal.h"
#include "trace_helper.h"
@@ -165,159 +166,8 @@ uint64_t ElfSize(const void *emi){
return total_size;
}
namespace
{
template<typename P>
inline
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p)
{
using namespace std;
const auto it = find_if(
reader.sections.begin(), reader.sections.end(), move(p));
return it != reader.sections.end() ? *it : nullptr;
}
inline
std::vector<std::string> copy_names_of_undefined_symbols(
const ELFIO::symbol_section_accessor& section)
{
using namespace ELFIO;
using namespace std;
vector<string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
section.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (sect_idx == SHN_UNDEF && !name.empty()) {
r.push_back(std::move(name));
}
}
return r;
}
inline
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword> find_symbol_address(
const ELFIO::symbol_section_accessor& section,
const std::string& symbol_name)
{
using namespace ELFIO;
using namespace std;
static const pair<Elf64_Addr, Elf_Xword> r{0, 0};
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
section.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (name == symbol_name) return make_pair(value, size);
}
return r;
}
inline
void associate_code_object_symbols_with_host_allocation(
hipModule_t module,
const ELFIO::elfio& reader,
const ELFIO::elfio& self_reader,
ELFIO::section* code_object_dynsym,
ELFIO::section* process_symtab,
hsa_agent_t agent,
hsa_executable_t executable)
{
using namespace ELFIO;
using namespace std;
if (!code_object_dynsym || !process_symtab) return;
const auto undefined_symbols = copy_names_of_undefined_symbols(
symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
const auto tmp = find_symbol_address(
symbol_section_accessor{self_reader, process_symtab}, x);
assert(tmp.first);
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(tmp.first), tmp.second, &agent, 1, &p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
static vector<
unique_ptr<void, decltype(hsa_amd_memory_unlock)*>> globals;
static mutex mtx;
lock_guard<std::mutex> lck{mtx};
globals.emplace_back(p, hsa_amd_memory_unlock);
if (module->coGlobals.count(x) == 0) {
module->coGlobals.emplace(x, tmp.first);
}
}
}
inline
void load_code_object_and_freeze_executable(
const char* file, hsa_agent_t agent, hsa_executable_t executable)
{ // TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
using namespace std;
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
hsa_code_object_reader_destroy(*p);
};
using RAII_code_reader = unique_ptr<
hsa_code_object_reader_t, decltype(cor_deleter)>;
unique_ptr<FILE, decltype(fclose)*> cobj{fopen(file, "r"), fclose};
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get());
hsa_executable_load_agent_code_object(
executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static vector<RAII_code_reader> code_readers;
static mutex mtx;
lock_guard<mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
}
hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
{
using namespace ELFIO;
HIP_INIT_API(module, fname);
hipError_t ret = hipSuccess;
*module = new ihipModule_t;
@@ -340,37 +190,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
nullptr,
&(*module)->executable);
elfio reader;
if (!reader.load(fname)) {
std::ifstream file{fname};
if (!file.is_open()) {
return ihipLogStatus(hipErrorFileNotFound);
}
else {
// TODO: this may benefit from caching as well.
elfio self_reader;
self_reader.load("/proc/self/exe");
const auto symtab =
find_section_if(self_reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_SYMTAB;
});
const auto code_object_dynsym =
find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_DYNSYM;
});
associate_code_object_symbols_with_host_allocation(
*module,
reader,
self_reader,
code_object_dynsym,
symtab,
currentDevice->_hsaAgent,
(*module)->executable);
load_code_object_and_freeze_executable(
fname, currentDevice->_hsaAgent, (*module)->executable);
}
(*module)->executable = hip_impl::load_executable(
(*module)->executable, currentDevice->_hsaAgent, file);
ret = (*module)->executable.handle ? hipSuccess : hipErrorUnknown;
}
return ihipLogStatus(ret);
@@ -725,16 +552,93 @@ namespace
}
inline
std::vector<Agent_global> read_agent_globals(hipModule_t hmodule)
std::vector<Agent_global> read_agent_globals(
hsa_agent_t agent, hsa_executable_t executable)
{
std::vector<Agent_global> r;
hsa_executable_iterate_agent_symbols(
hmodule->executable, this_agent(), copy_agent_global_variables, &r);
executable, agent, copy_agent_global_variables, &r);
return r;
}
template<typename ForwardIterator>
std::pair<hipDeviceptr_t, std::size_t> read_global_description(
ForwardIterator f, ForwardIterator l, const char* name)
{
const auto it = std::find_if(
f, l, [=](const Agent_global& x) { return x.name == name; });
return it == l ?
std::make_pair(nullptr, 0u) :
std::make_pair(it->address, it->byte_cnt);
}
hipError_t read_agent_global_from_module(
hipDeviceptr_t *dptr,
size_t* bytes,
hipModule_t hmod,
const char* name)
{
static std::unordered_map<
hipModule_t, std::vector<Agent_global>> agent_globals;
// TODO: this is not particularly robust.
if (agent_globals.count(hmod) == 0) {
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
if (agent_globals.count(hmod) == 0) {
agent_globals.emplace(
hmod, read_agent_globals(this_agent(), hmod->executable));
}
}
// TODO: This is unsafe iff some other emplacement triggers rehashing.
// It will have to be properly fleshed out in the future.
const auto it0 = agent_globals.find(hmod);
if (it0 == agent_globals.cend()) {
throw std::runtime_error{"agent_globals data structure corrupted."};
}
std::tie(*dptr, *bytes) = read_global_description(
it0->second.cbegin(), it0->second.cend(), name);
return dptr ? hipSuccess : hipErrorNotFound;
}
hipError_t read_agent_global_from_process(
hipDeviceptr_t *dptr, size_t* bytes, const char* name)
{
static std::unordered_map<
hsa_agent_t, std::vector<Agent_global>> agent_globals;
static std::once_flag f;
std::call_once(f, []() {
for (auto&& agent_executables : hip_impl::executables()) {
std::vector<Agent_global> tmp0;
for (auto&& executable : agent_executables.second) {
auto tmp1 = read_agent_globals(
agent_executables.first, executable);
tmp0.insert(
tmp0.end(),
std::make_move_iterator(tmp1.begin()),
std::make_move_iterator(tmp1.end()));
}
agent_globals.emplace(agent_executables.first, std::move(tmp0));
}
});
const auto it = agent_globals.find(this_agent());
if (it == agent_globals.cend()) return hipErrorNotInitialized;
std::tie(*dptr, *bytes) = read_global_description(
it->second.cbegin(), it->second.cend(), name);
return dptr ? hipSuccess : hipErrorNotFound;
}
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
@@ -745,41 +649,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
if(dptr == NULL || bytes == NULL){
return ihipLogStatus(hipErrorInvalidValue);
}
if(name == NULL || hmod == NULL){
if(name == NULL){
return ihipLogStatus(hipErrorNotInitialized);
}
else{
static std::unordered_map<
hipModule_t, std::vector<Agent_global>> agent_globals;
ret = hmod ?
read_agent_global_from_module(dptr, bytes, hmod, name) :
read_agent_global_from_process(dptr, bytes, name);
// TODO: this is not particularly robust.
if (agent_globals.count(hmod) == 0) {
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
if (agent_globals.count(hmod) == 0) {
agent_globals.emplace(hmod, read_agent_globals(hmod));
}
}
// TODO: This is unsafe iff some other emplacement triggers rehashing.
// It will have to be properly fleshed out in the future.
const auto it0 = agent_globals.find(hmod);
if (it0 == agent_globals.cend()) {
throw std::runtime_error{"agent_globals data structure corrupted."};
}
const auto it1 = std::find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const Agent_global& x) { return x.name == name; });
if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound);
*dptr = it1->address;
*bytes = it1->byte_cnt;
return ihipLogStatus(hipSuccess);
return ihipLogStatus(ret);
}
}
@@ -848,9 +726,9 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
if(name == NULL || hmod == NULL){
ret = hipErrorNotInitialized;
} else{
const auto it = hmod->coGlobals.find(name);
if (it == hmod->coGlobals.end()) return ihipLogStatus(hipErrorInvalidValue);
*texRef = reinterpret_cast<textureReference*>(it->second);
const auto it = hip_impl::globals().find(name);
if (it == hip_impl::globals().end()) return ihipLogStatus(hipErrorInvalidValue);
*texRef = reinterpret_cast<textureReference*>(it->second.get());
ret = hipSuccess;
}
}
+97
Просмотреть файл
@@ -0,0 +1,97 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <iostream>
#include <sstream>
namespace hip_impl
{
hc::accelerator_view lock_stream_hip_(
hipStream_t& stream, void*& locked_stream)
{ // This allocated but does not take ownership of locked_stream. If it is
// not deleted elsewhere it will leak.
using L = decltype(stream->lockopen_preKernelCommand());
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
locked_stream = new L{stream->lockopen_preKernelCommand()};
return (*static_cast<L*>(locked_stream))->_av;
}
void print_prelaunch_trace_(
const char* kernel_name,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream)
{
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
HIP_PROFILE_API ||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
std::stringstream os;
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
<< " hipLaunchKernel '" << kernel_name << "'"
<< " gridDim:" << num_blocks
<< " groupDim:" << dim_blocks
<< " sharedMem:+" << group_mem_bytes
<< " " << *stream;
if (HIP_PROFILE_API == 0x1) {
std::string shortAtpString("hipLaunchKernel:");
shortAtpString += kernel_name;
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
} else if (HIP_PROFILE_API == 0x2) {
MARKER_BEGIN(os.str().c_str(), "HIP");
}
if (COMPILE_HIP_DB && HIP_TRACE_API) {
std::string fullStr;
recordApiTrace(&fullStr, os.str());
}
}
}
void unlock_stream_hip_(
hipStream_t stream,
void* locked_stream,
const char* kernel_name,
hc::accelerator_view* acc_v)
{ // Precondition: acc_v is the accelerator_view associated with stream
// which is guarded by locked_stream;
// locked_stream is deletable.
using L = decltype(stream->lockopen_preKernelCommand());
stream->lockclose_postKernelCommand(kernel_name, acc_v);
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
+567
Просмотреть файл
@@ -0,0 +1,567 @@
#include "../include/hip/hcc_detail/program_state.hpp"
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "hip_hcc_internal.h"
#include "trace_helper.h"
#include "elfio/elfio.hpp"
#include <link.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
using namespace ELFIO;
using namespace hip_impl;
using namespace std;
namespace std
{
template<>
struct hash<hsa_isa_t> {
size_t operator()(hsa_isa_t x) const
{
return hash<decltype(x.handle)>{}(x.handle);
}
};
}
inline
constexpr
bool operator==(hsa_isa_t x, hsa_isa_t y)
{
return x.handle == y.handle;
}
namespace
{
struct Symbol {
std::string name;
ELFIO::Elf64_Addr value = 0;
ELFIO::Elf_Xword size = 0;
ELFIO::Elf_Half sect_idx = 0;
std::uint8_t bind = 0;
std::uint8_t type = 0;
std::uint8_t other = 0;
};
inline
Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx)
{
assert(idx < section.get_symbols_num());
Symbol r;
section.get_symbol(
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
return r;
}
template<typename P>
inline
section* find_section_if(elfio& reader, P p)
{
const auto it = find_if(
reader.sections.begin(), reader.sections.end(), move(p));
return it != reader.sections.end() ? *it : nullptr;
}
vector<string> copy_names_of_undefined_symbols(
const symbol_section_accessor& section)
{
vector<string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(section, i);
if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) {
r.push_back(std::move(tmp.name));
}
}
return r;
}
const std::unordered_map<
std::string,
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& symbol_addresses()
{
static unordered_map<string, pair<Elf64_Addr, Elf_Xword>> r;
static once_flag f;
call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
static constexpr const char self[] = "/proc/self/exe";
elfio reader;
static unsigned int iter = 0u;
if (reader.load(!iter ? self : info->dlpi_name)) {
auto it = find_section_if(
reader, [](const class section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (it) {
const symbol_section_accessor symtab{reader, it};
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
auto tmp = read_symbol(symtab, i);
if (tmp.type == STT_OBJECT &&
tmp.sect_idx != SHN_UNDEF) {
const auto addr =
tmp.value + (iter ? info->dlpi_addr : 0);
r.emplace(
move(tmp.name), make_pair(addr, tmp.size));
}
}
}
++iter;
}
return 0;
}, nullptr);
});
return r;
}
void associate_code_object_symbols_with_host_allocation(
const elfio& reader,
const elfio& self_reader,
section* code_object_dynsym,
section* process_symtab,
hsa_agent_t agent,
hsa_executable_t executable)
{
if (!code_object_dynsym || !process_symtab) return;
const auto undefined_symbols = copy_names_of_undefined_symbols(
symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
if (globals().find(x) != globals().cend()) return;
const auto it1 = symbol_addresses().find(x);
if (it1 == symbol_addresses().cend()) {
throw runtime_error{"Global symbol: " + x + " is undefined."};
}
static mutex mtx;
lock_guard<mutex> lck{mtx};
if (globals().find(x) != globals().cend()) return;
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(it1->second.first),
it1->second.second,
nullptr, // All agents.
0,
&p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock});
}
}
vector<uint8_t> code_object_blob_for_process()
{
static constexpr const char self[] = "/proc/self/exe";
static constexpr const char kernel_section[] = ".kernel";
elfio reader;
if (!reader.load(self)) {
throw runtime_error{"Failed to load ELF file for current process."};
}
auto kernels = find_section_if(reader, [](const section* x) {
return x->get_name() == kernel_section;
});
vector<uint8_t> r;
if (kernels) {
r.insert(
r.end(),
kernels->get_data(),
kernels->get_data() + kernels->get_size());
}
return r;
}
const unordered_map<hsa_isa_t, vector<vector<uint8_t>>>& code_object_blobs()
{
static unordered_map<hsa_isa_t, vector<vector<uint8_t>>> r;
static once_flag f;
call_once(f, []() {
static vector<vector<uint8_t>> blobs{
code_object_blob_for_process()};
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_name() == ".kernel";
});
if (it) blobs.emplace_back(
it->get_data(), it->get_data() + it->get_size());
}
return 0;
}, nullptr);
for (auto&& blob : blobs) {
Bundled_code_header tmp{blob};
if (valid(tmp)) {
for (auto&& bundle : bundles(tmp)) {
r[triple_to_hsa_isa(bundle.triple)].push_back(
bundle.blob);
}
}
}
});
return r;
}
vector<pair<uintptr_t, string>> function_names_for(
const elfio& reader, section* symtab)
{
vector<pair<uintptr_t, string>> r;
symbol_section_accessor symbols{reader, symtab};
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(symbols, i);
if (tmp.type == STT_FUNC &&
tmp.sect_idx != SHN_UNDEF &&
!tmp.name.empty()) {
r.emplace_back(tmp.value, tmp.name);
}
}
return r;
}
const vector<pair<uintptr_t, string>>& function_names_for_process()
{
static constexpr const char self[] = "/proc/self/exe";
static vector<pair<uintptr_t, string>> r;
static once_flag f;
call_once(f, []() {
elfio reader;
if (!reader.load(self)) {
throw runtime_error{
"Failed to load the ELF file for the current process."};
}
auto symtab = find_section_if(reader, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (symtab) r = function_names_for(reader, symtab);
});
return r;
}
inline
hsa_agent_t agent(hsa_executable_symbol_t x)
{
hsa_agent_t r = {};
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
return r;
}
inline
uint32_t group_size(hsa_executable_symbol_t x)
{
uint32_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
return r;
}
inline
uint64_t kernel_object(hsa_executable_symbol_t x)
{
uint64_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
return r;
}
inline
string name(hsa_executable_symbol_t x)
{
uint32_t sz = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
string r(sz, '\0');
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
return r;
}
inline
uint32_t private_size(hsa_executable_symbol_t x)
{
uint32_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
return r;
}
inline
hsa_symbol_kind_t type(hsa_executable_symbol_t x)
{
hsa_symbol_kind_t r = {};
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
return r;
}
const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels()
{
static unordered_map<string, vector<hsa_executable_symbol_t>> r;
static once_flag f;
call_once(f, []() {
static const auto copy_kernels = [](
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) {
if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
return HSA_STATUS_SUCCESS;
};
for (auto&& agent_executables : executables()) {
for (auto&& executable : agent_executables.second) {
hsa_executable_iterate_agent_symbols(
executable,
agent_executables.first,
copy_kernels,
nullptr);
}
}
});
return r;
}
void load_code_object_and_freeze_executable(
istream& file, hsa_agent_t agent, hsa_executable_t executable)
{ // TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
hsa_code_object_reader_destroy(*p);
};
using RAII_code_reader = unique_ptr<
hsa_code_object_reader_t, decltype(cor_deleter)>;
file.seekg(0);
vector<uint8_t> blob{
istreambuf_iterator<char>{file}, istreambuf_iterator<char>{}};
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_memory(
blob.data(), blob.size(), tmp.get());
hsa_executable_load_agent_code_object(
executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static vector<RAII_code_reader> code_readers;
static mutex mtx;
lock_guard<mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
}
namespace hip_impl
{
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>& executables()
{
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
static once_flag f;
call_once(f, []() {
static const auto accelerators = hc::accelerator::get_all();
for (auto&& acc : accelerators) {
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
if (!agent || !acc.is_hsa_accelerator()) continue;
hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) {
const auto it = code_object_blobs().find(x);
if (it != code_object_blobs().cend()) {
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
for (auto&& blob : it->second) {
hsa_executable_t tmp = {};
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&tmp);
// TODO: this is massively inefficient and only
// meant for illustration.
string blob_to_str{blob.cbegin(), blob.cend()};
stringstream istr{blob_to_str};
tmp = load_executable(tmp, a, istr);
if (tmp.handle) r[a].push_back(tmp);
}
}
return HSA_STATUS_SUCCESS;
}, agent);
}
});
return r;
}
const unordered_map<uintptr_t, string>& function_names()
{
static unordered_map<uintptr_t, string> r{
function_names_for_process().cbegin(),
function_names_for_process().cend()};
static once_flag f;
call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (it) {
auto n = function_names_for(tmp, it);
for (auto&& f : n) f.first += info->dlpi_addr;
r.insert(
make_move_iterator(n.begin()),
make_move_iterator(n.end()));
}
}
return 0;
}, nullptr);
});
return r;
}
const unordered_map<
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions()
{
static unordered_map<
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
static once_flag f;
call_once(f, []() {
for (auto&& function : function_names()) {
const auto it = kernels().find(function.second);
if (it != kernels().cend()) {
for (auto&& kernel_symbol : it->second) {
r[function.first].emplace_back(
agent(kernel_symbol),
Kernel_descriptor{
kernel_object(kernel_symbol),
group_size(kernel_symbol),
private_size(kernel_symbol),
it->first});
}
}
}
});
return r;
}
unordered_map<string, RAII_global>& globals()
{
static unordered_map<string, RAII_global> r;
static once_flag f;
call_once(f, []() { r.reserve(symbol_addresses().size()); });
return r;
}
hsa_executable_t load_executable(
hsa_executable_t executable, hsa_agent_t agent, istream& file)
{
elfio reader;
if (!reader.load(file)) {
return hsa_executable_t{};
}
else {
// TODO: this may benefit from caching as well.
elfio self_reader;
self_reader.load("/proc/self/exe");
const auto symtab =
find_section_if(self_reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_SYMTAB;
});
const auto code_object_dynsym =
find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_DYNSYM;
});
associate_code_object_symbols_with_host_allocation(
reader, self_reader, code_object_dynsym, symtab, agent, executable);
load_code_object_and_freeze_executable(file, agent, executable);
return executable;
}
}
} // Namespace hip_impl.
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = threadIdx.x;
memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
memset(ptr + tx, val, sizeof(uint32_t));
}
int main()
+30 -15
Просмотреть файл
@@ -159,11 +159,16 @@ bool dataTypesRun(){
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -221,11 +226,16 @@ bool dataTypesRun2(){
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -281,11 +291,16 @@ bool dataTypesRun4(){
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
+10 -1
Просмотреть файл
@@ -69,7 +69,16 @@ int main(int argc, char *argv[])
// Record the start event
HIPCHECK (hipEventRecord(start, NULL));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const float*>(A_d),
static_cast<const float*>(B_d),
C_d,
N);
HIPCHECK (hipEventRecord(stop, NULL));
+10 -1
Просмотреть файл
@@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
HIPCHECK(hipEventRecord(timingDisabled, stream));
// sandwhich a kernel:
HIPCHECK(hipEventRecord(start, stream));
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
stream,
static_cast<const int*>(C_d),
C_h,
numElements,
count);
HIPCHECK(hipEventRecord(stop, stream));
+10 -1
Просмотреть файл
@@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements,
HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
}
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const T*>(dmem->A_d()),
static_cast<const T*>(dmem->B_d()),
dmem->C_d(),
numElements);
if (useDeviceToDevice) {
// Do an extra device-to-device copy here to mix things up:
+26 -8
Просмотреть файл
@@ -49,21 +49,39 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
@@ -73,8 +91,8 @@ int main()
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
+26 -8
Просмотреть файл
@@ -50,25 +50,43 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s));
HIPCHECK(hipStreamSynchronize(s));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
@@ -78,7 +96,7 @@ int main()
}
passed();
}
+24 -6
Просмотреть файл
@@ -48,24 +48,42 @@ int main()
HIPCHECK(hipMalloc(&X_d,Nbytes));
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
@@ -74,7 +92,7 @@ int main()
passed();
}
+25 -7
Просмотреть файл
@@ -51,26 +51,44 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HIPCHECK (hipStreamSynchronize(s));
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
@@ -79,7 +97,7 @@ int main()
}
passed();
}
+10 -1
Просмотреть файл
@@ -63,7 +63,16 @@ void simpleTest1()
HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+10 -1
Просмотреть файл
@@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
// This is the null stream?
//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
hipLaunchKernel(
HipTest::vectorADDReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const T*>(A_d),
static_cast<const T*>(B_d),
C_d,
numElements);
MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
+33 -3
Просмотреть файл
@@ -128,7 +128,17 @@ void Streamer<T>::enqueAsync()
{
printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
static_cast<const T*>(_B_d),
_C_d,
_numElements,
p_repeat);
}
@@ -225,7 +235,17 @@ int main(int argc, char *argv[])
auto lastStreamer = streamers[s - 1];
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
0/*nullstream*/,
static_cast<const int*>(lastStreamer->_C_d),
static_cast<const int*>(lastStreamer->_C_d),
nullStreamer->_C_d,
numElements,
1/*repeat*/);
if (p_db) {
@@ -257,7 +277,17 @@ int main(int argc, char *argv[])
auto lastStreamer = streamers[s - 1];
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
0/*nullstream*/,
static_cast<const int*>(lastStreamer->_C_d),
static_cast<const int*>(lastStreamer->_C_d),
nullStreamer->_C_d,
numElements,
1/*repeat*/);
nullStreamer->D2H();
+10 -1
Просмотреть файл
@@ -97,7 +97,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
// Launch kernel into null stream, should result in C_h == count.
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
0 /*stream*/,
static_cast<const int*>(C_d),
C_h,
numElements,
count);
HIPCHECK(hipEventRecord(stop, 0/*default*/));
switch (syncMode) {
+20 -2
Просмотреть файл
@@ -163,9 +163,27 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
if (_commandType == COMMAND_ADD_REVERSE) {
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
_C_d,
static_cast<int64_t>(_numElements),
static_cast<int>(p_count));
} else if (_commandType == COMMAND_ADD_FORWARD) {
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(
HipTest::addCount,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
_C_d,
_numElements,
static_cast<int>(p_count));
} else if (_commandType == COMMAND_COPY) {
HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
} else {