Merge pull request #246 from AlexVlx/feature_use_module_based_dispatch_instead_of_pfe
Use natural dispatch syntax
Этот коммит содержится в:
@@ -167,6 +167,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}")
|
||||
|
||||
set(SOURCE_FILES_RUNTIME
|
||||
src/code_object_bundle.cpp
|
||||
src/hip_hcc.cpp
|
||||
src/hip_context.cpp
|
||||
src/hip_device.cpp
|
||||
@@ -179,7 +180,8 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
src/hip_db.cpp
|
||||
src/grid_launch.cpp
|
||||
src/hip_texture.cpp
|
||||
src/env.cpp)
|
||||
src/env.cpp
|
||||
src/program_state.cpp)
|
||||
|
||||
set(SOURCE_FILES_DEVICE
|
||||
src/device_util.cpp
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <istream>
|
||||
#include <iterator>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
hsa_isa_t triple_to_hsa_isa(const std::string& triple);
|
||||
|
||||
struct Bundled_code {
|
||||
union {
|
||||
struct {
|
||||
std::uint64_t offset;
|
||||
std::uint64_t bundle_sz;
|
||||
std::uint64_t triple_sz;
|
||||
};
|
||||
std::uint8_t cbuf[
|
||||
sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
|
||||
};
|
||||
std::string triple;
|
||||
std::vector<std::uint8_t> blob;
|
||||
};
|
||||
|
||||
class Bundled_code_header {
|
||||
// DATA - STATICS
|
||||
static constexpr const char magic_string_[] =
|
||||
"__CLANG_OFFLOAD_BUNDLE__";
|
||||
static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
|
||||
|
||||
// DATA
|
||||
union {
|
||||
struct {
|
||||
std::uint8_t bundler_magic_string_[magic_string_sz_];
|
||||
std::uint64_t bundle_cnt_;
|
||||
};
|
||||
std::uint8_t cbuf_[
|
||||
sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)];
|
||||
};
|
||||
std::vector<Bundled_code> bundles_;
|
||||
|
||||
// FRIENDS - MANIPULATORS
|
||||
template<typename RandomAccessIterator>
|
||||
friend
|
||||
inline
|
||||
bool read(
|
||||
RandomAccessIterator f,
|
||||
RandomAccessIterator l,
|
||||
Bundled_code_header& x)
|
||||
{
|
||||
if (f == l) return false;
|
||||
|
||||
std::copy_n(f, sizeof(x.cbuf_), x.cbuf_);
|
||||
|
||||
if (valid(x)) {
|
||||
x.bundles_.resize(x.bundle_cnt_);
|
||||
|
||||
auto it = f + sizeof(x.cbuf_);
|
||||
for (auto&& y : x.bundles_) {
|
||||
std::copy_n(it, sizeof(y.cbuf), y.cbuf);
|
||||
it += sizeof(y.cbuf);
|
||||
|
||||
y.triple.insert(y.triple.cend(), it, it + y.triple_sz);
|
||||
|
||||
std::copy_n(
|
||||
f + y.offset, y.bundle_sz, std::back_inserter(y.blob));
|
||||
|
||||
it += y.triple_sz;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
friend
|
||||
inline
|
||||
bool read(const std::vector<std::uint8_t>& blob, Bundled_code_header& x)
|
||||
{
|
||||
return read(blob.cbegin(), blob.cend(), x);
|
||||
}
|
||||
friend
|
||||
inline
|
||||
bool read(std::istream& is, Bundled_code_header& x)
|
||||
{
|
||||
return read(std::vector<std::uint8_t>{
|
||||
std::istreambuf_iterator<char>{is},
|
||||
std::istreambuf_iterator<char>{}},
|
||||
x);
|
||||
}
|
||||
|
||||
// FRIENDS - ACCESSORS
|
||||
friend
|
||||
inline
|
||||
bool valid(const Bundled_code_header& x)
|
||||
{
|
||||
return std::equal(
|
||||
x.bundler_magic_string_,
|
||||
x.bundler_magic_string_ + magic_string_sz_,
|
||||
x.magic_string_);
|
||||
}
|
||||
friend
|
||||
inline
|
||||
const std::vector<Bundled_code>& bundles(const Bundled_code_header& x)
|
||||
{
|
||||
return x.bundles_;
|
||||
}
|
||||
public:
|
||||
// CREATORS
|
||||
Bundled_code_header() = default;
|
||||
template<typename RandomAccessIterator>
|
||||
Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
|
||||
explicit
|
||||
Bundled_code_header(const std::vector<std::uint8_t>& blob);
|
||||
Bundled_code_header(const Bundled_code_header&) = default;
|
||||
Bundled_code_header(Bundled_code_header&&) = default;
|
||||
~Bundled_code_header() = default;
|
||||
|
||||
// MANIPULATORS
|
||||
Bundled_code_header& operator=(const Bundled_code_header&) = default;
|
||||
Bundled_code_header& operator=(Bundled_code_header&&) = default;
|
||||
};
|
||||
|
||||
// CREATORS
|
||||
template<typename I>
|
||||
Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
|
||||
{
|
||||
read(f, l, *this);
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -0,0 +1,159 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "code_object_bundle.hpp"
|
||||
#include "concepts.hpp"
|
||||
#include "helpers.hpp"
|
||||
#include "program_state.hpp"
|
||||
|
||||
#include "hc.hpp"
|
||||
#include "hip/hip_hcc.h"
|
||||
#include "hip_runtime.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
template<
|
||||
typename T,
|
||||
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
|
||||
inline
|
||||
T round_up_to_next_multiple_nonnegative(T x, T y)
|
||||
{
|
||||
T tmp = x + y - 1;
|
||||
return tmp - tmp % y;
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::uint8_t> make_kernarg()
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::uint8_t> make_kernarg(std::vector<std::uint8_t> kernarg)
|
||||
{
|
||||
return kernarg;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline
|
||||
std::vector<std::uint8_t> make_kernarg(std::vector<uint8_t> kernarg, T x)
|
||||
{
|
||||
kernarg.resize(
|
||||
round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) +
|
||||
sizeof(T));
|
||||
|
||||
new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)};
|
||||
|
||||
return kernarg;
|
||||
}
|
||||
|
||||
template<typename T, typename... Ts>
|
||||
inline
|
||||
std::vector<std::uint8_t> make_kernarg(
|
||||
std::vector<std::uint8_t> kernarg, T x, Ts... xs)
|
||||
{
|
||||
return make_kernarg(
|
||||
make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...);
|
||||
}
|
||||
|
||||
template<typename... Ts>
|
||||
inline
|
||||
std::vector<std::uint8_t> make_kernarg(Ts... xs)
|
||||
{
|
||||
std::vector<std::uint8_t> kernarg;
|
||||
kernarg.reserve(sizeof(std::tuple<Ts...>));
|
||||
|
||||
return make_kernarg(std::move(kernarg), std::move(xs)...);
|
||||
}
|
||||
|
||||
void hipLaunchKernelGGLImpl(
|
||||
std::uintptr_t function_address,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void** kernarg);
|
||||
} // Namespace hip_impl.
|
||||
|
||||
template<typename... Args, typename F = void (*)(Args...)>
|
||||
inline
|
||||
void hipLaunchKernelGGL(
|
||||
F kernel,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
Args... args)
|
||||
{
|
||||
auto kernarg = hip_impl::make_kernarg(std::move(args)...);
|
||||
std::size_t kernarg_size = kernarg.size();
|
||||
|
||||
void* config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
|
||||
hip_impl::hipLaunchKernelGGLImpl(
|
||||
reinterpret_cast<std::uintptr_t>(kernel),
|
||||
numBlocks,
|
||||
dimBlocks,
|
||||
sharedMemBytes,
|
||||
stream,
|
||||
&config[0]);
|
||||
}
|
||||
|
||||
template<typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
|
||||
inline
|
||||
void hipLaunchKernel(
|
||||
F kernel,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
std::uint32_t groupMemBytes,
|
||||
hipStream_t stream,
|
||||
Args... args)
|
||||
{
|
||||
hipLaunchKernelGGL(
|
||||
kernel,
|
||||
numBlocks,
|
||||
dimBlocks,
|
||||
groupMemBytes,
|
||||
stream,
|
||||
hipLaunchParm{},
|
||||
std::move(args)...);
|
||||
}
|
||||
|
||||
@@ -19,989 +19,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if GENERIC_GRID_LAUNCH == 1
|
||||
|
||||
#include "concepts.hpp"
|
||||
#include "helpers.hpp"
|
||||
|
||||
#include "hc.hpp"
|
||||
#include "hip/hip_hcc.h"
|
||||
#include "hip_runtime.h"
|
||||
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
namespace
|
||||
{
|
||||
struct New_grid_launch_tag {};
|
||||
struct Old_grid_launch_tag {};
|
||||
|
||||
template<typename C, typename D>
|
||||
class RAII_guard {
|
||||
D dtor_;
|
||||
public:
|
||||
RAII_guard() = default;
|
||||
|
||||
RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)}
|
||||
{
|
||||
ctor();
|
||||
}
|
||||
|
||||
RAII_guard(const RAII_guard&) = default;
|
||||
RAII_guard(RAII_guard&&) = default;
|
||||
|
||||
RAII_guard& operator=(const RAII_guard&) = default;
|
||||
RAII_guard& operator=(RAII_guard&&) = default;
|
||||
|
||||
~RAII_guard() { dtor_(); }
|
||||
};
|
||||
|
||||
template<typename C, typename D>
|
||||
RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor)
|
||||
{
|
||||
return RAII_guard<C, D>{ctor, std::move(dtor)};
|
||||
}
|
||||
|
||||
template<FunctionalProcedure F, typename... Ts>
|
||||
using is_new_grid_launch_t = typename std::conditional<
|
||||
is_callable<F(Ts...)>{},
|
||||
New_grid_launch_tag,
|
||||
Old_grid_launch_tag>::type;
|
||||
}
|
||||
|
||||
// TODO: - dispatch rank should be derived from the domain dimensions passed
|
||||
// in, and not always assumed to be 3;
|
||||
|
||||
template<FunctionalProcedure K, typename... Ts>
|
||||
requires(Domain<K> == {Ts...})
|
||||
inline
|
||||
void grid_launch_hip_impl_(
|
||||
New_grid_launch_tag,
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
const hc::accelerator_view& acc_v,
|
||||
K k)
|
||||
{
|
||||
const auto d = hc::extent<3>{
|
||||
num_blocks.z * dim_blocks.z,
|
||||
num_blocks.y * dim_blocks.y,
|
||||
num_blocks.x * dim_blocks.x}.tile_with_dynamic(
|
||||
dim_blocks.z,
|
||||
dim_blocks.y,
|
||||
dim_blocks.x,
|
||||
group_mem_bytes);
|
||||
|
||||
try {
|
||||
hc::parallel_for_each(acc_v, d, k);
|
||||
}
|
||||
catch (std::exception& ex) {
|
||||
std::cerr << "Failed in " << __func__ << ", with exception: "
|
||||
<< ex.what() << std::endl;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: these are workarounds, they should be removed.
|
||||
|
||||
hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&);
|
||||
void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
|
||||
void unlock_stream_hip_(
|
||||
hipStream_t, void*, const char*, hc::accelerator_view*);
|
||||
|
||||
template<FunctionalProcedure K, typename... Ts>
|
||||
requires(Domain<K> == {Ts...})
|
||||
inline
|
||||
void grid_launch_hip_impl_(
|
||||
New_grid_launch_tag,
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream,
|
||||
const char* kernel_name,
|
||||
K k)
|
||||
{
|
||||
void* lck_stream = nullptr;
|
||||
auto acc_v = lock_stream_hip_(stream, lck_stream);
|
||||
auto stream_guard = make_RAII_guard(
|
||||
std::bind(
|
||||
print_prelaunch_trace_,
|
||||
kernel_name,
|
||||
num_blocks,
|
||||
dim_blocks,
|
||||
group_mem_bytes,
|
||||
stream),
|
||||
std::bind(
|
||||
unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v));
|
||||
|
||||
try {
|
||||
grid_launch_hip_impl_(
|
||||
New_grid_launch_tag{},
|
||||
std::move(num_blocks),
|
||||
std::move(dim_blocks),
|
||||
group_mem_bytes,
|
||||
acc_v,
|
||||
std::move(k));
|
||||
}
|
||||
catch (std::exception& ex) {
|
||||
std::cerr << "Failed in " << __func__ << ", with exception: "
|
||||
<< ex.what() << std::endl;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
template<FunctionalProcedure K, typename... Ts>
|
||||
requires(Domain<K> == {hipLaunchParm, Ts...})
|
||||
inline
|
||||
void grid_launch_hip_impl_(
|
||||
Old_grid_launch_tag,
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream,
|
||||
K k)
|
||||
{
|
||||
grid_launch_hip_impl_(
|
||||
New_grid_launch_tag{},
|
||||
std::move(num_blocks),
|
||||
std::move(dim_blocks),
|
||||
group_mem_bytes,
|
||||
std::move(stream),
|
||||
std::move(k));
|
||||
}
|
||||
|
||||
template<FunctionalProcedure K, typename... Ts>
|
||||
requires(Domain<K> == {hipLaunchParm, Ts...})
|
||||
inline
|
||||
void grid_launch_hip_impl_(
|
||||
Old_grid_launch_tag,
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream,
|
||||
const char* kernel_name,
|
||||
K k)
|
||||
{
|
||||
grid_launch_hip_impl_(
|
||||
New_grid_launch_tag{},
|
||||
std::move(num_blocks),
|
||||
std::move(dim_blocks),
|
||||
group_mem_bytes,
|
||||
std::move(stream),
|
||||
kernel_name,
|
||||
std::move(k));
|
||||
}
|
||||
|
||||
template<FunctionalProcedure K, typename... Ts>
|
||||
requires(Domain<K> == {Ts...})
|
||||
inline
|
||||
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream,
|
||||
const char* kernel_name,
|
||||
K k)
|
||||
{
|
||||
grid_launch_hip_impl_(
|
||||
is_new_grid_launch_t<K, Ts...>{},
|
||||
std::move(num_blocks),
|
||||
std::move(dim_blocks),
|
||||
group_mem_bytes,
|
||||
std::move(stream),
|
||||
kernel_name,
|
||||
std::move(k));
|
||||
}
|
||||
|
||||
template<FunctionalProcedure K, typename... Ts>
|
||||
requires(Domain<K> == {Ts...})
|
||||
inline
|
||||
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream,
|
||||
K k)
|
||||
{
|
||||
grid_launch_hip_impl_(
|
||||
is_new_grid_launch_t<K, Ts...>{},
|
||||
std::move(num_blocks),
|
||||
std::move(dim_blocks),
|
||||
group_mem_bytes,
|
||||
std::move(stream),
|
||||
std::move(k));
|
||||
}
|
||||
|
||||
// TODO: these are temporary and purposefully noisy and disruptive.
|
||||
#define make_kernel_name_hip(k, n)\
|
||||
HIP_kernel_functor_name_begin ## _ ## k ## _ ## \
|
||||
HIP_kernel_functor_name_end ## _ ## n
|
||||
|
||||
#define make_kernel_functor_hip_30(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
|
||||
p24, p25, p26, p27)\
|
||||
struct make_kernel_name_hip(function_name, 28) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
std::decay_t<decltype(p21)> _p21_;\
|
||||
std::decay_t<decltype(p22)> _p22_;\
|
||||
std::decay_t<decltype(p23)> _p23_;\
|
||||
std::decay_t<decltype(p24)> _p24_;\
|
||||
std::decay_t<decltype(p25)> _p25_;\
|
||||
std::decay_t<decltype(p26)> _p26_;\
|
||||
std::decay_t<decltype(p27)> _p27_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
|
||||
_p26_, _p27_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_29(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
|
||||
p24, p25, p26)\
|
||||
struct make_kernel_name_hip(function_name, 27) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
std::decay_t<decltype(p21)> _p21_;\
|
||||
std::decay_t<decltype(p22)> _p22_;\
|
||||
std::decay_t<decltype(p23)> _p23_;\
|
||||
std::decay_t<decltype(p24)> _p24_;\
|
||||
std::decay_t<decltype(p25)> _p25_;\
|
||||
std::decay_t<decltype(p26)> _p26_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
|
||||
_p26_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_28(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
|
||||
p24, p25)\
|
||||
struct make_kernel_name_hip(function_name, 26) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
std::decay_t<decltype(p21)> _p21_;\
|
||||
std::decay_t<decltype(p22)> _p22_;\
|
||||
std::decay_t<decltype(p23)> _p23_;\
|
||||
std::decay_t<decltype(p24)> _p24_;\
|
||||
std::decay_t<decltype(p25)> _p25_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_27(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
|
||||
p24)\
|
||||
struct make_kernel_name_hip(function_name, 25) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
std::decay_t<decltype(p21)> _p21_;\
|
||||
std::decay_t<decltype(p22)> _p22_;\
|
||||
std::decay_t<decltype(p23)> _p23_;\
|
||||
std::decay_t<decltype(p24)> _p24_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_26(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\
|
||||
struct make_kernel_name_hip(function_name, 24) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
std::decay_t<decltype(p21)> _p21_;\
|
||||
std::decay_t<decltype(p22)> _p22_;\
|
||||
std::decay_t<decltype(p23)> _p23_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_25(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\
|
||||
struct make_kernel_name_hip(function_name, 23) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
std::decay_t<decltype(p21)> _p21_;\
|
||||
std::decay_t<decltype(p22)> _p22_;\
|
||||
__attribute__((used, flatten))\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_, _p21_, _p22_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_24(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\
|
||||
struct make_kernel_name_hip(function_name, 22) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
std::decay_t<decltype(p21)> _p21_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_, _p21_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_23(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\
|
||||
struct make_kernel_name_hip(function_name, 21) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
std::decay_t<decltype(p20)> _p20_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_, _p20_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_22(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\
|
||||
struct make_kernel_name_hip(function_name, 20) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
std::decay_t<decltype(p19)> _p19_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_, _p19_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_21(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17, p18)\
|
||||
struct make_kernel_name_hip(function_name, 19) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
std::decay_t<decltype(p18)> _p18_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
|
||||
_p18_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_20(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16, p17)\
|
||||
struct make_kernel_name_hip(function_name, 18) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
std::decay_t<decltype(p17)> _p17_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_19(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15, p16)\
|
||||
struct make_kernel_name_hip(function_name, 17) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
std::decay_t<decltype(p16)> _p16_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_18(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14, p15)\
|
||||
struct make_kernel_name_hip(function_name, 16) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
std::decay_t<decltype(p15)> _p15_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_17(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13, p14)\
|
||||
struct make_kernel_name_hip(function_name, 15) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
std::decay_t<decltype(p14)> _p14_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_, _p14_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_16(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12, p13)\
|
||||
struct make_kernel_name_hip(function_name, 14) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
std::decay_t<decltype(p13)> _p13_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_, _p13_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_15(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11, p12)\
|
||||
struct make_kernel_name_hip(function_name, 13) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
std::decay_t<decltype(p12)> _p12_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_, _p12_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_14(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
|
||||
p10, p11)\
|
||||
struct make_kernel_name_hip(function_name, 12) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
std::decay_t<decltype(p11)> _p11_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_, _p11_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_13(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\
|
||||
struct make_kernel_name_hip(function_name, 11) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
std::decay_t<decltype(p10)> _p10_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
|
||||
_p10_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_12(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\
|
||||
struct make_kernel_name_hip(function_name, 10) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
std::decay_t<decltype(p9)> _p9_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\
|
||||
_p9_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_11(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\
|
||||
struct make_kernel_name_hip(function_name, 9) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
std::decay_t<decltype(p8)> _p8_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(\
|
||||
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_10(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\
|
||||
struct make_kernel_name_hip(function_name, 8) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
std::decay_t<decltype(p7)> _p7_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_9(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\
|
||||
struct make_kernel_name_hip(function_name, 7) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
std::decay_t<decltype(p6)> _p6_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_8(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4, p5)\
|
||||
struct make_kernel_name_hip(function_name, 6) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
std::decay_t<decltype(p5)> _p5_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_7(\
|
||||
function_name, kernel_name, p0, p1, p2, p3, p4)\
|
||||
struct make_kernel_name_hip(function_name, 5) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
std::decay_t<decltype(p4)> _p4_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\
|
||||
struct make_kernel_name_hip(function_name, 4) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
std::decay_t<decltype(p3)> _p3_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_, _p1_, _p2_, _p3_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\
|
||||
struct make_kernel_name_hip(function_name, 3) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
std::decay_t<decltype(p2)> _p2_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_, _p1_, _p2_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\
|
||||
struct make_kernel_name_hip(function_name, 2) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
std::decay_t<decltype(p1)> _p1_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_, _p1_);\
|
||||
}\
|
||||
}
|
||||
#define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n
|
||||
#define make_kernel_functor_hip_3(function_name, kernel_name, p0)\
|
||||
struct make_kernel_name_hip(function_name, 1) {\
|
||||
std::decay_t<decltype(p0)> _p0_;\
|
||||
void operator()(const hc::tiled_index<3>&) const [[hc]]\
|
||||
{\
|
||||
kernel_name(_p0_);\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_2(function_name, kernel_name)\
|
||||
struct make_kernel_name_hip(function_name, 0) {\
|
||||
void operator()(const hc::tiled_index<3>&) [[hc]]\
|
||||
{\
|
||||
return kernel_name(hipLaunchParm{});\
|
||||
}\
|
||||
}
|
||||
#define make_kernel_functor_hip_1(...)
|
||||
#define make_kernel_functor_hip_0(...)
|
||||
#define make_kernel_functor_hip_(...)\
|
||||
overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__)
|
||||
|
||||
|
||||
#define hipLaunchNamedKernelGGL(\
|
||||
function_name,\
|
||||
kernel_name,\
|
||||
num_blocks,\
|
||||
dim_blocks,\
|
||||
group_mem_bytes,\
|
||||
stream,\
|
||||
...)\
|
||||
do {\
|
||||
make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\
|
||||
hip_kernel_functor_impl_{__VA_ARGS__};\
|
||||
hip_impl::grid_launch_hip_(\
|
||||
num_blocks,\
|
||||
dim_blocks,\
|
||||
group_mem_bytes,\
|
||||
stream,\
|
||||
#kernel_name,\
|
||||
hip_kernel_functor_impl_);\
|
||||
} while(0)
|
||||
|
||||
#define hipLaunchKernelGGL(\
|
||||
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
|
||||
do {\
|
||||
hipLaunchNamedKernelGGL(\
|
||||
unnamed,\
|
||||
kernel_name,\
|
||||
num_blocks,\
|
||||
dim_blocks,\
|
||||
group_mem_bytes,\
|
||||
stream,\
|
||||
##__VA_ARGS__);\
|
||||
} while (0)
|
||||
|
||||
#define hipLaunchKernel(\
|
||||
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
|
||||
do {\
|
||||
hipLaunchKernelGGL(\
|
||||
kernel_name,\
|
||||
num_blocks,\
|
||||
dim_blocks,\
|
||||
group_mem_bytes,\
|
||||
stream,\
|
||||
hipLaunchParm{},\
|
||||
##__VA_ARGS__);\
|
||||
} while(0)
|
||||
}
|
||||
#endif //GENERIC_GRID_LAUNCH
|
||||
#if __hcc_workweek__ >= 17481
|
||||
#include "functional_grid_launch.hpp"
|
||||
#else
|
||||
#include "macro_based_grid_launch.hpp"
|
||||
#endif
|
||||
#endif //GENERIC_GRID_LAUNCH
|
||||
@@ -44,7 +44,12 @@ THE SOFTWARE.
|
||||
#if GENERIC_GRID_LAUNCH == 0
|
||||
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
|
||||
#else
|
||||
#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak))
|
||||
#if __hcc_workweek__ >= 17481
|
||||
#define __global__ \
|
||||
__attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
|
||||
#else
|
||||
#define __global__ __attribute__((hc, used))
|
||||
#endif
|
||||
#endif //GENERIC_GRID_LAUNCH
|
||||
|
||||
#define __noinline__ __attribute__((noinline))
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <istream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
struct ihipModuleSymbol_t;
|
||||
using hipFunction_t = ihipModuleSymbol_t*;
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<>
|
||||
struct hash<hsa_agent_t> {
|
||||
size_t operator()(hsa_agent_t x) const
|
||||
{
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
inline
|
||||
constexpr
|
||||
bool operator==(hsa_agent_t x, hsa_agent_t y)
|
||||
{
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
struct Kernel_descriptor {
|
||||
std::uint64_t kernel_object_;
|
||||
std::uint32_t group_size_;
|
||||
std::uint32_t private_size_;
|
||||
std::string name_;
|
||||
|
||||
operator hipFunction_t() const
|
||||
{ // TODO: this is awful and only meant for illustration.
|
||||
return reinterpret_cast<hipFunction_t>(
|
||||
const_cast<Kernel_descriptor*>(this));
|
||||
}
|
||||
};
|
||||
|
||||
using RAII_global = std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
|
||||
|
||||
const std::unordered_map<
|
||||
hsa_agent_t, std::vector<hsa_executable_t>>& executables();
|
||||
const std::unordered_map<
|
||||
std::uintptr_t,
|
||||
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
|
||||
const std::unordered_map<std::uintptr_t, std::string>& function_names();
|
||||
std::unordered_map<std::string, RAII_global>& globals();
|
||||
|
||||
hsa_executable_t load_executable(
|
||||
hsa_executable_t executable, hsa_agent_t agent, std::istream& file);
|
||||
} // Namespace hip_impl.
|
||||
@@ -0,0 +1,39 @@
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple)
|
||||
{
|
||||
static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx";
|
||||
static constexpr std::size_t prefix_sz = sizeof(prefix) - 1;
|
||||
|
||||
hsa_isa_t r = {};
|
||||
|
||||
auto idx = triple.find(prefix);
|
||||
|
||||
if (idx != std::string::npos) {
|
||||
idx += prefix_sz;
|
||||
std::string tmp = "AMD:AMDGPU";
|
||||
while (idx != triple.size()) {
|
||||
tmp.push_back(':');
|
||||
tmp.push_back(triple[idx++]);
|
||||
}
|
||||
|
||||
hsa_isa_from_name(tmp.c_str(), &r);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// DATA - STATICS
|
||||
constexpr const char hip_impl::Bundled_code_header::magic_string_[];
|
||||
|
||||
// CREATORS
|
||||
hip_impl::Bundled_code_header::Bundled_code_header(
|
||||
const std::vector<std::uint8_t>& x)
|
||||
: Bundled_code_header{x.cbegin(), x.cend()}
|
||||
{}
|
||||
@@ -28,27 +28,6 @@ extern "C" float __ocml_rint_f32(float);
|
||||
extern "C" float __ocml_ceil_f32(float);
|
||||
extern "C" float __ocml_trunc_f32(float);
|
||||
|
||||
struct holder64Bit{
|
||||
union{
|
||||
double d;
|
||||
unsigned long int uli;
|
||||
signed long int sli;
|
||||
signed int si[2];
|
||||
unsigned int ui[2];
|
||||
};
|
||||
} __attribute__((aligned(8)));
|
||||
|
||||
struct holder32Bit {
|
||||
union {
|
||||
float f;
|
||||
unsigned int ui;
|
||||
signed int si;
|
||||
};
|
||||
} __attribute__((aligned(4)));
|
||||
|
||||
__device__ struct holder64Bit hold64;
|
||||
__device__ struct holder32Bit hold32;
|
||||
|
||||
__device__ float __double2float_rd(double x)
|
||||
{
|
||||
return (double)x;
|
||||
@@ -69,13 +48,21 @@ __device__ float __double2float_rz(double x)
|
||||
|
||||
__device__ int __double2hiint(double x)
|
||||
{
|
||||
hold64.d = x;
|
||||
return hold64.si[1];
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &x, sizeof(tmp));
|
||||
|
||||
return tmp[1];
|
||||
}
|
||||
__device__ int __double2loint(double x)
|
||||
{
|
||||
hold64.d = x;
|
||||
return hold64.si[0];
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &x, sizeof(tmp));
|
||||
|
||||
return tmp[0];
|
||||
}
|
||||
|
||||
|
||||
@@ -150,8 +137,12 @@ __device__ unsigned long long int __double2ull_rz(double x)
|
||||
|
||||
__device__ long long int __double_as_longlong(double x)
|
||||
{
|
||||
hold64.d = x;
|
||||
return hold64.sli;
|
||||
static_assert(sizeof(long long) == sizeof(double), "");
|
||||
|
||||
long long tmp;
|
||||
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ int __float2int_rd(float x)
|
||||
@@ -224,19 +215,32 @@ __device__ unsigned long long int __float2ull_rz(float x)
|
||||
|
||||
__device__ int __float_as_int(float x)
|
||||
{
|
||||
hold32.f = x;
|
||||
return hold32.si;
|
||||
static_assert(sizeof(int) == sizeof(float), "");
|
||||
|
||||
int tmp;
|
||||
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
||||
|
||||
return tmp;
|
||||
}
|
||||
__device__ unsigned int __float_as_uint(float x)
|
||||
{
|
||||
hold32.f = x;
|
||||
return hold32.ui;
|
||||
static_assert(sizeof(unsigned int) == sizeof(float), "");
|
||||
|
||||
unsigned int tmp;
|
||||
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
||||
|
||||
return tmp;
|
||||
}
|
||||
__device__ double __hiloint2double(int hi, int lo)
|
||||
__device__ double __hiloint2double(int32_t hi, int32_t lo)
|
||||
{
|
||||
hold64.si[1] = hi;
|
||||
hold64.si[0] = lo;
|
||||
return hold64.d;
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
|
||||
uint64_t tmp0 =
|
||||
(static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
|
||||
double tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
|
||||
return tmp1;
|
||||
}
|
||||
__device__ double __int2double_rn(int x)
|
||||
{
|
||||
@@ -262,8 +266,12 @@ __device__ float __int2float_rz(int x)
|
||||
|
||||
__device__ float __int_as_float(int x)
|
||||
{
|
||||
hold32.si = x;
|
||||
return hold32.f;
|
||||
static_assert(sizeof(float) == sizeof(int), "");
|
||||
|
||||
float tmp;
|
||||
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ double __ll2double_rd(long long int x)
|
||||
@@ -302,8 +310,12 @@ __device__ float __ll2float_rz(long long int x)
|
||||
|
||||
__device__ double __longlong_as_double(long long int x)
|
||||
{
|
||||
hold64.sli = x;
|
||||
return hold64.d;
|
||||
static_assert(sizeof(double) == sizeof(long long), "");
|
||||
|
||||
double tmp;
|
||||
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
__device__ double __uint2double_rn(int x)
|
||||
@@ -330,8 +342,12 @@ __device__ float __uint2float_rz(unsigned int x)
|
||||
|
||||
__device__ float __uint_as_float(unsigned int x)
|
||||
{
|
||||
hold32.ui = x;
|
||||
return hold32.f;
|
||||
static_assert(sizeof(float) == sizeof(unsigned int), "");
|
||||
|
||||
float tmp;
|
||||
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ double __ull2double_rd(unsigned long long int x)
|
||||
|
||||
@@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr)
|
||||
// loop unrolling
|
||||
__device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size)
|
||||
{
|
||||
uint8_t *dstPtr, *srcPtr;
|
||||
dstPtr = (uint8_t*)dst;
|
||||
srcPtr = (uint8_t*)src;
|
||||
for(uint32_t i=0;i<size;i++) {
|
||||
dstPtr[i] = srcPtr[i];
|
||||
auto dstPtr = static_cast<uint8_t*>(dst);
|
||||
auto srcPtr = static_cast<const uint8_t*>(src);
|
||||
|
||||
while (size >= 4u) {
|
||||
dstPtr[0] = srcPtr[0];
|
||||
dstPtr[1] = srcPtr[1];
|
||||
dstPtr[2] = srcPtr[2];
|
||||
dstPtr[3] = srcPtr[3];
|
||||
|
||||
size -= 4u;
|
||||
srcPtr += 4u;
|
||||
dstPtr += 4u;
|
||||
}
|
||||
return nullptr;
|
||||
switch (size) {
|
||||
case 3: dstPtr[2] = srcPtr[2];
|
||||
case 2: dstPtr[1] = srcPtr[1];
|
||||
case 1: dstPtr[0] = srcPtr[0];
|
||||
}
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size)
|
||||
__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size)
|
||||
{
|
||||
uint8_t *dstPtr;
|
||||
dstPtr = (uint8_t*)ptr;
|
||||
for(uint32_t i=0;i<size;i++) {
|
||||
dstPtr[i] = val;
|
||||
auto dstPtr = static_cast<uint8_t*>(dst);
|
||||
|
||||
while (size >= 4u) {
|
||||
dstPtr[0] = val;
|
||||
dstPtr[1] = val;
|
||||
dstPtr[2] = val;
|
||||
dstPtr[3] = val;
|
||||
|
||||
size -= 4u;
|
||||
dstPtr += 4u;
|
||||
}
|
||||
return nullptr;
|
||||
switch (size) {
|
||||
case 3: dstPtr[2] = val;
|
||||
case 2: dstPtr[1] = val;
|
||||
case 1: dstPtr[0] = val;
|
||||
}
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
__device__ float __hip_erfinvf(float x){
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include "hip/hip_runtime_api.h"
|
||||
|
||||
// Internal header, do not percolate upwards.
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "hc.hpp"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
using namespace hc;
|
||||
using namespace std;
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
namespace
|
||||
{
|
||||
inline
|
||||
string name(uintptr_t function_address)
|
||||
{
|
||||
const auto it = function_names().find(function_address);
|
||||
|
||||
if (it == function_names().cend()) {
|
||||
throw runtime_error{
|
||||
"Invalid function passed to hipLaunchKernelGGL."};
|
||||
}
|
||||
|
||||
return it->second;
|
||||
}
|
||||
|
||||
inline
|
||||
string name(hsa_agent_t agent)
|
||||
{
|
||||
char n[64] = {};
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
|
||||
|
||||
return string{n};
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_agent_t target_agent(hipStream_t stream)
|
||||
{
|
||||
if (stream) {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
stream->locked_getAv()->get_hsa_agent());
|
||||
}
|
||||
else if (
|
||||
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
|
||||
return ihipGetDevice(
|
||||
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
|
||||
}
|
||||
else {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
accelerator{}.get_default_view().get_hsa_agent());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void hipLaunchKernelGGLImpl(
|
||||
uintptr_t function_address,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void** kernarg)
|
||||
{
|
||||
const auto it0 = functions().find(function_address);
|
||||
|
||||
if (it0 == functions().cend()) {
|
||||
throw runtime_error{
|
||||
"No device code available for function: " +
|
||||
name(function_address)
|
||||
};
|
||||
}
|
||||
|
||||
auto agent = target_agent(stream);
|
||||
|
||||
const auto it1 = find_if(
|
||||
it0->second.cbegin(),
|
||||
it0->second.cend(),
|
||||
[=](const pair<hsa_agent_t, Kernel_descriptor>& x) {
|
||||
return x.first.handle == agent.handle;
|
||||
});
|
||||
|
||||
if (it1 == it0->second.cend()) {
|
||||
throw runtime_error{
|
||||
"No code available for function: " + name(function_address) +
|
||||
", for agent: " + name(agent)
|
||||
};
|
||||
}
|
||||
|
||||
for (auto&& agent_kernel : it0->second) {
|
||||
if (agent.handle == agent_kernel.first.handle) {
|
||||
hipModuleLaunchKernel(
|
||||
agent_kernel.second,
|
||||
numBlocks.x,
|
||||
numBlocks.y,
|
||||
numBlocks.z,
|
||||
dimBlocks.x,
|
||||
dimBlocks.y,
|
||||
dimBlocks.z,
|
||||
sharedMemBytes,
|
||||
stream,
|
||||
nullptr,
|
||||
kernarg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -22,78 +22,8 @@ THE SOFTWARE.
|
||||
|
||||
#include "hip/hcc_detail/grid_launch_GGL.hpp"
|
||||
|
||||
// Internal header, do not percolate upwards.
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "hc.hpp"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
hc::accelerator_view lock_stream_hip_(
|
||||
hipStream_t& stream, void*& locked_stream)
|
||||
{ // This allocated but does not take ownership of locked_stream. If it is
|
||||
// not deleted elsewhere it will leak.
|
||||
using L = decltype(stream->lockopen_preKernelCommand());
|
||||
|
||||
HIP_INIT();
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
locked_stream = new L{stream->lockopen_preKernelCommand()};
|
||||
return (*static_cast<L*>(locked_stream))->_av;
|
||||
}
|
||||
|
||||
void print_prelaunch_trace_(
|
||||
const char* kernel_name,
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream)
|
||||
{
|
||||
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
|
||||
HIP_PROFILE_API ||
|
||||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
|
||||
std::stringstream os;
|
||||
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
|
||||
<< " hipLaunchKernel '" << kernel_name << "'"
|
||||
<< " gridDim:" << num_blocks
|
||||
<< " groupDim:" << dim_blocks
|
||||
<< " sharedMem:+" << group_mem_bytes
|
||||
<< " " << *stream;
|
||||
|
||||
if (HIP_PROFILE_API == 0x1) {
|
||||
std::string shortAtpString("hipLaunchKernel:");
|
||||
shortAtpString += kernel_name;
|
||||
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
|
||||
} else if (HIP_PROFILE_API == 0x2) {
|
||||
MARKER_BEGIN(os.str().c_str(), "HIP");
|
||||
}
|
||||
|
||||
if (COMPILE_HIP_DB && HIP_TRACE_API) {
|
||||
std::string fullStr;
|
||||
recordApiTrace(&fullStr, os.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void unlock_stream_hip_(
|
||||
hipStream_t stream,
|
||||
void* locked_stream,
|
||||
const char* kernel_name,
|
||||
hc::accelerator_view* acc_v)
|
||||
{ // Precondition: acc_v is the accelerator_view associated with stream
|
||||
// which is guarded by locked_stream;
|
||||
// locked_stream is deletable.
|
||||
using L = decltype(stream->lockopen_preKernelCommand());
|
||||
|
||||
stream->lockclose_postKernelCommand(kernel_name, acc_v);
|
||||
|
||||
delete static_cast<L*>(locked_stream);
|
||||
locked_stream = nullptr;
|
||||
if(HIP_PROFILE_API) {
|
||||
MARKER_END();
|
||||
}
|
||||
}
|
||||
}
|
||||
#if __hcc_workweek__ >= 17481
|
||||
#include "functional_grid_launch.inl"
|
||||
#else
|
||||
#include "macro_based_grid_launch.inl"
|
||||
#endif
|
||||
@@ -248,7 +248,7 @@ static const DbName dbName [] =
|
||||
#if COMPILE_HIP_DB
|
||||
#define tprintf(trace_level, ...) {\
|
||||
if (HIP_DB & (1<<(trace_level))) {\
|
||||
char msgStr[1000];\
|
||||
char msgStr[2000];\
|
||||
snprintf(msgStr, 2000, __VA_ARGS__);\
|
||||
fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \
|
||||
}\
|
||||
|
||||
@@ -808,6 +808,26 @@ hipError_t hipHostUnregister(void *hostPtr)
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
inline
|
||||
hipDeviceptr_t agent_address_for_symbol(const char* symbolName)
|
||||
{
|
||||
hipDeviceptr_t r = nullptr;
|
||||
|
||||
#if __hcc_workweek__ >= 17481
|
||||
size_t byte_cnt = 0u;
|
||||
hipModuleGetGlobal(&r, &byte_cnt, 0, symbolName);
|
||||
#else
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
auto acc = ctx->getDevice()->_acc;
|
||||
r = acc.get_symbol_address(symbolName);
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
|
||||
{
|
||||
HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind);
|
||||
@@ -821,7 +841,8 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *dst = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t dst =
|
||||
agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
|
||||
if(dst == nullptr)
|
||||
@@ -856,7 +877,8 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *src = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t src =
|
||||
agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
|
||||
if(dst == nullptr)
|
||||
@@ -893,7 +915,8 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *dst = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t dst =
|
||||
agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
|
||||
if(dst == nullptr)
|
||||
@@ -931,7 +954,8 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *src = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t src =
|
||||
agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src);
|
||||
|
||||
if(src == nullptr || dst == nullptr)
|
||||
@@ -1285,42 +1309,59 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
// TODO - make member function of stream?
|
||||
namespace
|
||||
{
|
||||
template<
|
||||
uint32_t block_dim,
|
||||
typename RandomAccessIterator,
|
||||
typename N,
|
||||
typename T>
|
||||
__global__
|
||||
void hip_fill_n(RandomAccessIterator f, N n, T value)
|
||||
{
|
||||
const uint32_t grid_dim = gridDim.x * blockDim.x;
|
||||
|
||||
size_t idx = blockIdx.x * block_dim + threadIdx.x;
|
||||
while (idx < n) {
|
||||
__builtin_memcpy(
|
||||
reinterpret_cast<void*>(&f[idx]),
|
||||
reinterpret_cast<const void*>(&value),
|
||||
sizeof(T));
|
||||
idx += grid_dim;
|
||||
}
|
||||
}
|
||||
|
||||
template<
|
||||
typename T,
|
||||
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
|
||||
inline
|
||||
const T& clamp_integer(const T& x, const T& lower, const T& upper)
|
||||
{
|
||||
assert(!(upper < lower));
|
||||
|
||||
return std::min(upper, std::max(x, lower));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
ihipMemsetKernel(hipStream_t stream,
|
||||
LockedAccessor_StreamCrit_t &crit,
|
||||
T * ptr, T val, size_t sizeBytes,
|
||||
hc::completion_future *cf)
|
||||
T * ptr, T val, size_t sizeBytes)
|
||||
{
|
||||
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
|
||||
const int threads_per_wg = 256;
|
||||
static constexpr uint32_t block_dim = 256;
|
||||
|
||||
int threads = wg * threads_per_wg;
|
||||
if (threads > sizeBytes) {
|
||||
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
|
||||
}
|
||||
|
||||
|
||||
hc::extent<1> ext(threads);
|
||||
auto ext_tile = ext.tile(threads_per_wg);
|
||||
|
||||
*cf =
|
||||
hc::parallel_for_each(
|
||||
crit->_av,
|
||||
ext_tile,
|
||||
[=] (hc::tiled_index<1> idx)
|
||||
__attribute__((hc))
|
||||
{
|
||||
int offset = amp_get_global_id(0);
|
||||
// TODO-HCC - change to hc_get_local_size()
|
||||
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
|
||||
|
||||
for (int i=offset; i<sizeBytes; i+=stride) {
|
||||
ptr[i] = val;
|
||||
}
|
||||
});
|
||||
const uint32_t grid_dim = clamp_integer<size_t>(
|
||||
sizeBytes / block_dim, 1, UINT32_MAX);
|
||||
|
||||
hipLaunchKernelGGL(
|
||||
hip_fill_n<block_dim>,
|
||||
dim3(grid_dim),
|
||||
dim3{block_dim},
|
||||
0u,
|
||||
stream,
|
||||
ptr,
|
||||
sizeBytes,
|
||||
std::move(val));
|
||||
}
|
||||
|
||||
|
||||
@@ -1334,17 +1375,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
value = value & 0xff;
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1352,19 +1388,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
|
||||
|
||||
|
||||
if (HIP_API_BLOCKING) {
|
||||
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1385,16 +1418,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
value = value & 0xff;
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1402,21 +1431,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
// TODO - is hipMemset supposed to be async?
|
||||
cf.wait();
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
|
||||
|
||||
stream->locked_wait();
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
|
||||
}
|
||||
} else {
|
||||
@@ -1437,17 +1463,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
size_t sizeBytes = pitch * height;
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
value = value & 0xff;
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1455,20 +1477,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
// TODO - is hipMemset supposed to be async?
|
||||
cf.wait();
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
|
||||
stream->locked_wait();
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
|
||||
}
|
||||
} else {
|
||||
@@ -1489,36 +1509,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
std::cout << ex.what() << std::endl;
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
cf.wait();
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av);
|
||||
|
||||
stream->locked_wait();
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
|
||||
}
|
||||
} else {
|
||||
@@ -1717,7 +1731,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
|
||||
HIP_INIT_API ( handle, devPtr);
|
||||
hipError_t hipStatus = hipSuccess;
|
||||
// Get the size of allocated pointer
|
||||
size_t psize;
|
||||
size_t psize = 0u;
|
||||
hc::accelerator acc;
|
||||
if((handle == NULL) || (devPtr == NULL)) {
|
||||
hipStatus = hipErrorInvalidResourceHandle;
|
||||
|
||||
@@ -37,6 +37,7 @@ THE SOFTWARE.
|
||||
|
||||
#include "elfio/elfio.hpp"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hcc_detail/program_state.hpp"
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "trace_helper.h"
|
||||
|
||||
@@ -165,159 +166,8 @@ uint64_t ElfSize(const void *emi){
|
||||
return total_size;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
template<typename P>
|
||||
inline
|
||||
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p)
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
const auto it = find_if(
|
||||
reader.sections.begin(), reader.sections.end(), move(p));
|
||||
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::string> copy_names_of_undefined_symbols(
|
||||
const ELFIO::symbol_section_accessor& section)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
using namespace std;
|
||||
|
||||
vector<string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
section.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (sect_idx == SHN_UNDEF && !name.empty()) {
|
||||
r.push_back(std::move(name));
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword> find_symbol_address(
|
||||
const ELFIO::symbol_section_accessor& section,
|
||||
const std::string& symbol_name)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
using namespace std;
|
||||
|
||||
static const pair<Elf64_Addr, Elf_Xword> r{0, 0};
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
section.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (name == symbol_name) return make_pair(value, size);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
void associate_code_object_symbols_with_host_allocation(
|
||||
hipModule_t module,
|
||||
const ELFIO::elfio& reader,
|
||||
const ELFIO::elfio& self_reader,
|
||||
ELFIO::section* code_object_dynsym,
|
||||
ELFIO::section* process_symtab,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
using namespace std;
|
||||
|
||||
if (!code_object_dynsym || !process_symtab) return;
|
||||
|
||||
const auto undefined_symbols = copy_names_of_undefined_symbols(
|
||||
symbol_section_accessor{reader, code_object_dynsym});
|
||||
|
||||
for (auto&& x : undefined_symbols) {
|
||||
const auto tmp = find_symbol_address(
|
||||
symbol_section_accessor{self_reader, process_symtab}, x);
|
||||
|
||||
assert(tmp.first);
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(tmp.first), tmp.second, &agent, 1, &p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
|
||||
static vector<
|
||||
unique_ptr<void, decltype(hsa_amd_memory_unlock)*>> globals;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<std::mutex> lck{mtx};
|
||||
globals.emplace_back(p, hsa_amd_memory_unlock);
|
||||
if (module->coGlobals.count(x) == 0) {
|
||||
module->coGlobals.emplace(x, tmp.first);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void load_code_object_and_freeze_executable(
|
||||
const char* file, hsa_agent_t agent, hsa_executable_t executable)
|
||||
{ // TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
using namespace std;
|
||||
|
||||
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
};
|
||||
|
||||
using RAII_code_reader = unique_ptr<
|
||||
hsa_code_object_reader_t, decltype(cor_deleter)>;
|
||||
|
||||
unique_ptr<FILE, decltype(fclose)*> cobj{fopen(file, "r"), fclose};
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(
|
||||
executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
static vector<RAII_code_reader> code_readers;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<mutex> lck{mtx};
|
||||
code_readers.push_back(move(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
|
||||
HIP_INIT_API(module, fname);
|
||||
hipError_t ret = hipSuccess;
|
||||
*module = new ihipModule_t;
|
||||
@@ -340,37 +190,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
|
||||
nullptr,
|
||||
&(*module)->executable);
|
||||
|
||||
elfio reader;
|
||||
if (!reader.load(fname)) {
|
||||
std::ifstream file{fname};
|
||||
|
||||
if (!file.is_open()) {
|
||||
return ihipLogStatus(hipErrorFileNotFound);
|
||||
}
|
||||
else {
|
||||
// TODO: this may benefit from caching as well.
|
||||
elfio self_reader;
|
||||
self_reader.load("/proc/self/exe");
|
||||
|
||||
const auto symtab =
|
||||
find_section_if(self_reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
const auto code_object_dynsym =
|
||||
find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_DYNSYM;
|
||||
});
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(
|
||||
*module,
|
||||
reader,
|
||||
self_reader,
|
||||
code_object_dynsym,
|
||||
symtab,
|
||||
currentDevice->_hsaAgent,
|
||||
(*module)->executable);
|
||||
|
||||
load_code_object_and_freeze_executable(
|
||||
fname, currentDevice->_hsaAgent, (*module)->executable);
|
||||
}
|
||||
(*module)->executable = hip_impl::load_executable(
|
||||
(*module)->executable, currentDevice->_hsaAgent, file);
|
||||
ret = (*module)->executable.handle ? hipSuccess : hipErrorUnknown;
|
||||
}
|
||||
|
||||
return ihipLogStatus(ret);
|
||||
@@ -725,16 +552,93 @@ namespace
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<Agent_global> read_agent_globals(hipModule_t hmodule)
|
||||
std::vector<Agent_global> read_agent_globals(
|
||||
hsa_agent_t agent, hsa_executable_t executable)
|
||||
{
|
||||
std::vector<Agent_global> r;
|
||||
|
||||
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
hmodule->executable, this_agent(), copy_agent_global_variables, &r);
|
||||
executable, agent, copy_agent_global_variables, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename ForwardIterator>
|
||||
std::pair<hipDeviceptr_t, std::size_t> read_global_description(
|
||||
ForwardIterator f, ForwardIterator l, const char* name)
|
||||
{
|
||||
const auto it = std::find_if(
|
||||
f, l, [=](const Agent_global& x) { return x.name == name; });
|
||||
|
||||
return it == l ?
|
||||
std::make_pair(nullptr, 0u) :
|
||||
std::make_pair(it->address, it->byte_cnt);
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_module(
|
||||
hipDeviceptr_t *dptr,
|
||||
size_t* bytes,
|
||||
hipModule_t hmod,
|
||||
const char* name)
|
||||
{
|
||||
static std::unordered_map<
|
||||
hipModule_t, std::vector<Agent_global>> agent_globals;
|
||||
|
||||
// TODO: this is not particularly robust.
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
agent_globals.emplace(
|
||||
hmod, read_agent_globals(this_agent(), hmod->executable));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: This is unsafe iff some other emplacement triggers rehashing.
|
||||
// It will have to be properly fleshed out in the future.
|
||||
const auto it0 = agent_globals.find(hmod);
|
||||
if (it0 == agent_globals.cend()) {
|
||||
throw std::runtime_error{"agent_globals data structure corrupted."};
|
||||
}
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(
|
||||
it0->second.cbegin(), it0->second.cend(), name);
|
||||
|
||||
return dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_process(
|
||||
hipDeviceptr_t *dptr, size_t* bytes, const char* name)
|
||||
{
|
||||
static std::unordered_map<
|
||||
hsa_agent_t, std::vector<Agent_global>> agent_globals;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
for (auto&& agent_executables : hip_impl::executables()) {
|
||||
std::vector<Agent_global> tmp0;
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
auto tmp1 = read_agent_globals(
|
||||
agent_executables.first, executable);
|
||||
tmp0.insert(
|
||||
tmp0.end(),
|
||||
std::make_move_iterator(tmp1.begin()),
|
||||
std::make_move_iterator(tmp1.end()));
|
||||
}
|
||||
agent_globals.emplace(agent_executables.first, std::move(tmp0));
|
||||
}
|
||||
});
|
||||
|
||||
const auto it = agent_globals.find(this_agent());
|
||||
|
||||
if (it == agent_globals.cend()) return hipErrorNotInitialized;
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(
|
||||
it->second.cbegin(), it->second.cend(), name);
|
||||
|
||||
return dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
|
||||
@@ -745,41 +649,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
|
||||
if(dptr == NULL || bytes == NULL){
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
if(name == NULL || hmod == NULL){
|
||||
if(name == NULL){
|
||||
return ihipLogStatus(hipErrorNotInitialized);
|
||||
}
|
||||
else{
|
||||
static std::unordered_map<
|
||||
hipModule_t, std::vector<Agent_global>> agent_globals;
|
||||
ret = hmod ?
|
||||
read_agent_global_from_module(dptr, bytes, hmod, name) :
|
||||
read_agent_global_from_process(dptr, bytes, name);
|
||||
|
||||
// TODO: this is not particularly robust.
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
agent_globals.emplace(hmod, read_agent_globals(hmod));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: This is unsafe iff some other emplacement triggers rehashing.
|
||||
// It will have to be properly fleshed out in the future.
|
||||
const auto it0 = agent_globals.find(hmod);
|
||||
if (it0 == agent_globals.cend()) {
|
||||
throw std::runtime_error{"agent_globals data structure corrupted."};
|
||||
}
|
||||
|
||||
const auto it1 = std::find_if(
|
||||
it0->second.cbegin(),
|
||||
it0->second.cend(),
|
||||
[=](const Agent_global& x) { return x.name == name; });
|
||||
|
||||
if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound);
|
||||
|
||||
*dptr = it1->address;
|
||||
*bytes = it1->byte_cnt;
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
return ihipLogStatus(ret);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -848,9 +726,9 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
|
||||
if(name == NULL || hmod == NULL){
|
||||
ret = hipErrorNotInitialized;
|
||||
} else{
|
||||
const auto it = hmod->coGlobals.find(name);
|
||||
if (it == hmod->coGlobals.end()) return ihipLogStatus(hipErrorInvalidValue);
|
||||
*texRef = reinterpret_cast<textureReference*>(it->second);
|
||||
const auto it = hip_impl::globals().find(name);
|
||||
if (it == hip_impl::globals().end()) return ihipLogStatus(hipErrorInvalidValue);
|
||||
*texRef = reinterpret_cast<textureReference*>(it->second.get());
|
||||
ret = hipSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// Internal header, do not percolate upwards.
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "hc.hpp"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
hc::accelerator_view lock_stream_hip_(
|
||||
hipStream_t& stream, void*& locked_stream)
|
||||
{ // This allocated but does not take ownership of locked_stream. If it is
|
||||
// not deleted elsewhere it will leak.
|
||||
using L = decltype(stream->lockopen_preKernelCommand());
|
||||
|
||||
HIP_INIT();
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
locked_stream = new L{stream->lockopen_preKernelCommand()};
|
||||
return (*static_cast<L*>(locked_stream))->_av;
|
||||
}
|
||||
|
||||
void print_prelaunch_trace_(
|
||||
const char* kernel_name,
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream)
|
||||
{
|
||||
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
|
||||
HIP_PROFILE_API ||
|
||||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
|
||||
std::stringstream os;
|
||||
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
|
||||
<< " hipLaunchKernel '" << kernel_name << "'"
|
||||
<< " gridDim:" << num_blocks
|
||||
<< " groupDim:" << dim_blocks
|
||||
<< " sharedMem:+" << group_mem_bytes
|
||||
<< " " << *stream;
|
||||
|
||||
if (HIP_PROFILE_API == 0x1) {
|
||||
std::string shortAtpString("hipLaunchKernel:");
|
||||
shortAtpString += kernel_name;
|
||||
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
|
||||
} else if (HIP_PROFILE_API == 0x2) {
|
||||
MARKER_BEGIN(os.str().c_str(), "HIP");
|
||||
}
|
||||
|
||||
if (COMPILE_HIP_DB && HIP_TRACE_API) {
|
||||
std::string fullStr;
|
||||
recordApiTrace(&fullStr, os.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void unlock_stream_hip_(
|
||||
hipStream_t stream,
|
||||
void* locked_stream,
|
||||
const char* kernel_name,
|
||||
hc::accelerator_view* acc_v)
|
||||
{ // Precondition: acc_v is the accelerator_view associated with stream
|
||||
// which is guarded by locked_stream;
|
||||
// locked_stream is deletable.
|
||||
using L = decltype(stream->lockopen_preKernelCommand());
|
||||
|
||||
stream->lockclose_postKernelCommand(kernel_name, acc_v);
|
||||
|
||||
delete static_cast<L*>(locked_stream);
|
||||
locked_stream = nullptr;
|
||||
if(HIP_PROFILE_API) {
|
||||
MARKER_END();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,567 @@
|
||||
#include "../include/hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include "elfio/elfio.hpp"
|
||||
|
||||
#include <link.h>
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace ELFIO;
|
||||
using namespace hip_impl;
|
||||
using namespace std;
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<>
|
||||
struct hash<hsa_isa_t> {
|
||||
size_t operator()(hsa_isa_t x) const
|
||||
{
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
inline
|
||||
constexpr
|
||||
bool operator==(hsa_isa_t x, hsa_isa_t y)
|
||||
{
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
struct Symbol {
|
||||
std::string name;
|
||||
ELFIO::Elf64_Addr value = 0;
|
||||
ELFIO::Elf_Xword size = 0;
|
||||
ELFIO::Elf_Half sect_idx = 0;
|
||||
std::uint8_t bind = 0;
|
||||
std::uint8_t type = 0;
|
||||
std::uint8_t other = 0;
|
||||
};
|
||||
|
||||
inline
|
||||
Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx)
|
||||
{
|
||||
assert(idx < section.get_symbols_num());
|
||||
|
||||
Symbol r;
|
||||
section.get_symbol(
|
||||
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename P>
|
||||
inline
|
||||
section* find_section_if(elfio& reader, P p)
|
||||
{
|
||||
const auto it = find_if(
|
||||
reader.sections.begin(), reader.sections.end(), move(p));
|
||||
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
vector<string> copy_names_of_undefined_symbols(
|
||||
const symbol_section_accessor& section)
|
||||
{
|
||||
vector<string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
|
||||
auto tmp = read_symbol(section, i);
|
||||
if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) {
|
||||
r.push_back(std::move(tmp.name));
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const std::unordered_map<
|
||||
std::string,
|
||||
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& symbol_addresses()
|
||||
{
|
||||
static unordered_map<string, pair<Elf64_Addr, Elf_Xword>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
elfio reader;
|
||||
|
||||
static unsigned int iter = 0u;
|
||||
if (reader.load(!iter ? self : info->dlpi_name)) {
|
||||
auto it = find_section_if(
|
||||
reader, [](const class section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (it) {
|
||||
const symbol_section_accessor symtab{reader, it};
|
||||
|
||||
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
|
||||
auto tmp = read_symbol(symtab, i);
|
||||
|
||||
if (tmp.type == STT_OBJECT &&
|
||||
tmp.sect_idx != SHN_UNDEF) {
|
||||
const auto addr =
|
||||
tmp.value + (iter ? info->dlpi_addr : 0);
|
||||
r.emplace(
|
||||
move(tmp.name), make_pair(addr, tmp.size));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++iter;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void associate_code_object_symbols_with_host_allocation(
|
||||
const elfio& reader,
|
||||
const elfio& self_reader,
|
||||
section* code_object_dynsym,
|
||||
section* process_symtab,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable)
|
||||
{
|
||||
if (!code_object_dynsym || !process_symtab) return;
|
||||
|
||||
const auto undefined_symbols = copy_names_of_undefined_symbols(
|
||||
symbol_section_accessor{reader, code_object_dynsym});
|
||||
|
||||
for (auto&& x : undefined_symbols) {
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
|
||||
const auto it1 = symbol_addresses().find(x);
|
||||
|
||||
if (it1 == symbol_addresses().cend()) {
|
||||
throw runtime_error{"Global symbol: " + x + " is undefined."};
|
||||
}
|
||||
|
||||
static mutex mtx;
|
||||
lock_guard<mutex> lck{mtx};
|
||||
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(it1->second.first),
|
||||
it1->second.second,
|
||||
nullptr, // All agents.
|
||||
0,
|
||||
&p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
|
||||
globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock});
|
||||
}
|
||||
}
|
||||
|
||||
vector<uint8_t> code_object_blob_for_process()
|
||||
{
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
static constexpr const char kernel_section[] = ".kernel";
|
||||
|
||||
elfio reader;
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{"Failed to load ELF file for current process."};
|
||||
}
|
||||
|
||||
auto kernels = find_section_if(reader, [](const section* x) {
|
||||
return x->get_name() == kernel_section;
|
||||
});
|
||||
|
||||
vector<uint8_t> r;
|
||||
if (kernels) {
|
||||
r.insert(
|
||||
r.end(),
|
||||
kernels->get_data(),
|
||||
kernels->get_data() + kernels->get_size());
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<hsa_isa_t, vector<vector<uint8_t>>>& code_object_blobs()
|
||||
{
|
||||
static unordered_map<hsa_isa_t, vector<vector<uint8_t>>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static vector<vector<uint8_t>> blobs{
|
||||
code_object_blob_for_process()};
|
||||
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
elfio tmp;
|
||||
if (tmp.load(info->dlpi_name)) {
|
||||
const auto it = find_section_if(tmp, [](const section* x) {
|
||||
return x->get_name() == ".kernel";
|
||||
});
|
||||
|
||||
if (it) blobs.emplace_back(
|
||||
it->get_data(), it->get_data() + it->get_size());
|
||||
}
|
||||
return 0;
|
||||
}, nullptr);
|
||||
|
||||
for (auto&& blob : blobs) {
|
||||
Bundled_code_header tmp{blob};
|
||||
if (valid(tmp)) {
|
||||
for (auto&& bundle : bundles(tmp)) {
|
||||
r[triple_to_hsa_isa(bundle.triple)].push_back(
|
||||
bundle.blob);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
vector<pair<uintptr_t, string>> function_names_for(
|
||||
const elfio& reader, section* symtab)
|
||||
{
|
||||
vector<pair<uintptr_t, string>> r;
|
||||
symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(symbols, i);
|
||||
|
||||
if (tmp.type == STT_FUNC &&
|
||||
tmp.sect_idx != SHN_UNDEF &&
|
||||
!tmp.name.empty()) {
|
||||
r.emplace_back(tmp.value, tmp.name);
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const vector<pair<uintptr_t, string>>& function_names_for_process()
|
||||
{
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
|
||||
static vector<pair<uintptr_t, string>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
elfio reader;
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{
|
||||
"Failed to load the ELF file for the current process."};
|
||||
}
|
||||
|
||||
auto symtab = find_section_if(reader, [](const section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (symtab) r = function_names_for(reader, symtab);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_agent_t agent(hsa_executable_symbol_t x)
|
||||
{
|
||||
hsa_agent_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t group_size(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
uint64_t kernel_object(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint64_t r = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
string name(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint32_t sz = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
|
||||
|
||||
string r(sz, '\0');
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t private_size(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_symbol_kind_t type(hsa_executable_symbol_t x)
|
||||
{
|
||||
hsa_symbol_kind_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels()
|
||||
{
|
||||
static unordered_map<string, vector<hsa_executable_symbol_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static const auto copy_kernels = [](
|
||||
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) {
|
||||
if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
for (auto&& agent_executables : executables()) {
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
executable,
|
||||
agent_executables.first,
|
||||
copy_kernels,
|
||||
nullptr);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void load_code_object_and_freeze_executable(
|
||||
istream& file, hsa_agent_t agent, hsa_executable_t executable)
|
||||
{ // TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
};
|
||||
|
||||
using RAII_code_reader = unique_ptr<
|
||||
hsa_code_object_reader_t, decltype(cor_deleter)>;
|
||||
|
||||
file.seekg(0);
|
||||
|
||||
vector<uint8_t> blob{
|
||||
istreambuf_iterator<char>{file}, istreambuf_iterator<char>{}};
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_memory(
|
||||
blob.data(), blob.size(), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(
|
||||
executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
static vector<RAII_code_reader> code_readers;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<mutex> lck{mtx};
|
||||
code_readers.push_back(move(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>& executables()
|
||||
{
|
||||
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static const auto accelerators = hc::accelerator::get_all();
|
||||
|
||||
for (auto&& acc : accelerators) {
|
||||
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
|
||||
|
||||
if (!agent || !acc.is_hsa_accelerator()) continue;
|
||||
|
||||
hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) {
|
||||
const auto it = code_object_blobs().find(x);
|
||||
|
||||
if (it != code_object_blobs().cend()) {
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
|
||||
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only
|
||||
// meant for illustration.
|
||||
string blob_to_str{blob.cbegin(), blob.cend()};
|
||||
stringstream istr{blob_to_str};
|
||||
tmp = load_executable(tmp, a, istr);
|
||||
|
||||
if (tmp.handle) r[a].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}, agent);
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<uintptr_t, string>& function_names()
|
||||
{
|
||||
static unordered_map<uintptr_t, string> r{
|
||||
function_names_for_process().cbegin(),
|
||||
function_names_for_process().cend()};
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
|
||||
elfio tmp;
|
||||
if (tmp.load(info->dlpi_name)) {
|
||||
const auto it = find_section_if(tmp, [](const section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (it) {
|
||||
auto n = function_names_for(tmp, it);
|
||||
|
||||
for (auto&& f : n) f.first += info->dlpi_addr;
|
||||
|
||||
r.insert(
|
||||
make_move_iterator(n.begin()),
|
||||
make_move_iterator(n.end()));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<
|
||||
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions()
|
||||
{
|
||||
static unordered_map<
|
||||
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
for (auto&& function : function_names()) {
|
||||
const auto it = kernels().find(function.second);
|
||||
|
||||
if (it != kernels().cend()) {
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
r[function.first].emplace_back(
|
||||
agent(kernel_symbol),
|
||||
Kernel_descriptor{
|
||||
kernel_object(kernel_symbol),
|
||||
group_size(kernel_symbol),
|
||||
private_size(kernel_symbol),
|
||||
it->first});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
unordered_map<string, RAII_global>& globals()
|
||||
{
|
||||
static unordered_map<string, RAII_global> r;
|
||||
static once_flag f;
|
||||
call_once(f, []() { r.reserve(symbol_addresses().size()); });
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
hsa_executable_t load_executable(
|
||||
hsa_executable_t executable, hsa_agent_t agent, istream& file)
|
||||
{
|
||||
elfio reader;
|
||||
if (!reader.load(file)) {
|
||||
return hsa_executable_t{};
|
||||
}
|
||||
else {
|
||||
// TODO: this may benefit from caching as well.
|
||||
elfio self_reader;
|
||||
self_reader.load("/proc/self/exe");
|
||||
|
||||
const auto symtab =
|
||||
find_section_if(self_reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
const auto code_object_dynsym =
|
||||
find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_DYNSYM;
|
||||
});
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(
|
||||
reader, self_reader, code_object_dynsym, symtab, agent, executable);
|
||||
|
||||
load_code_object_and_freeze_executable(file, agent, executable);
|
||||
|
||||
return executable;
|
||||
}
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
@@ -23,7 +23,7 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
|
||||
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
|
||||
{
|
||||
int tx = threadIdx.x;
|
||||
memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
|
||||
memset(ptr + tx, val, sizeof(uint32_t));
|
||||
}
|
||||
|
||||
int main()
|
||||
|
||||
@@ -159,11 +159,16 @@ bool dataTypesRun(){
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
hipLaunchKernel(vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
deviceA ,deviceB ,WIDTH ,HEIGHT);
|
||||
hipLaunchKernel(
|
||||
vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0,
|
||||
0,
|
||||
deviceA,
|
||||
static_cast<const T*>(deviceB),
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
|
||||
@@ -221,11 +226,16 @@ bool dataTypesRun2(){
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
hipLaunchKernel(vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
deviceA ,deviceB,WIDTH ,HEIGHT);
|
||||
hipLaunchKernel(
|
||||
vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0,
|
||||
0,
|
||||
deviceA,
|
||||
static_cast<const T*>(deviceB),
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
|
||||
@@ -281,11 +291,16 @@ bool dataTypesRun4(){
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
hipLaunchKernel(vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
deviceA ,deviceB ,WIDTH ,HEIGHT);
|
||||
hipLaunchKernel(
|
||||
vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0,
|
||||
0,
|
||||
deviceA,
|
||||
static_cast<const T*>(deviceB),
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
|
||||
|
||||
@@ -69,7 +69,16 @@ int main(int argc, char *argv[])
|
||||
// Record the start event
|
||||
HIPCHECK (hipEventRecord(start, NULL));
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const float*>(A_d),
|
||||
static_cast<const float*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
|
||||
|
||||
HIPCHECK (hipEventRecord(stop, NULL));
|
||||
|
||||
@@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
|
||||
HIPCHECK(hipEventRecord(timingDisabled, stream));
|
||||
// sandwhich a kernel:
|
||||
HIPCHECK(hipEventRecord(start, stream));
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count);
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCountReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
stream,
|
||||
static_cast<const int*>(C_d),
|
||||
C_h,
|
||||
numElements,
|
||||
count);
|
||||
HIPCHECK(hipEventRecord(stop, stream));
|
||||
|
||||
|
||||
|
||||
@@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements,
|
||||
HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const T*>(dmem->A_d()),
|
||||
static_cast<const T*>(dmem->B_d()),
|
||||
dmem->C_d(),
|
||||
numElements);
|
||||
|
||||
if (useDeviceToDevice) {
|
||||
// Do an extra device-to-device copy here to mix things up:
|
||||
|
||||
@@ -49,21 +49,39 @@ int main()
|
||||
HIPCHECK(hipMalloc(&Y_d,Nbytes));
|
||||
HIPCHECK(hipMalloc(&Z_d,Nbytes));
|
||||
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
|
||||
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
|
||||
HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes));
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(X_d),
|
||||
static_cast<const int*>(Y_d),
|
||||
Z_d,
|
||||
N);
|
||||
HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
@@ -73,8 +91,8 @@ int main()
|
||||
HIPCHECK(hipFree(Y_d));
|
||||
HIPCHECK(hipFree(Z_d));
|
||||
}
|
||||
|
||||
|
||||
passed();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -50,25 +50,43 @@ int main()
|
||||
HIPCHECK(hipMalloc(&Y_d,Nbytes));
|
||||
HIPCHECK(hipMalloc(&Z_d,Nbytes));
|
||||
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HIPCHECK(hipStreamCreate(&s));
|
||||
|
||||
HIPCHECK(hipStreamCreate(&s));
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
|
||||
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
|
||||
HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s));
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(X_d),
|
||||
static_cast<const int*>(Y_d),
|
||||
Z_d,
|
||||
N);
|
||||
HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s));
|
||||
HIPCHECK(hipStreamSynchronize(s));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
HIPCHECK(hipStreamDestroy(s));
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
@@ -78,7 +96,7 @@ int main()
|
||||
}
|
||||
|
||||
passed();
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -48,24 +48,42 @@ int main()
|
||||
HIPCHECK(hipMalloc(&X_d,Nbytes));
|
||||
HIPCHECK(hipMalloc(&Y_d,Nbytes));
|
||||
HIPCHECK(hipMalloc(&Z_d,Nbytes));
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
|
||||
hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(X_d),
|
||||
static_cast<const int*>(Y_d),
|
||||
Z_d,
|
||||
N);
|
||||
HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIPCHECK(hipFree(X_d));
|
||||
HIPCHECK(hipFree(Y_d));
|
||||
@@ -74,7 +92,7 @@ int main()
|
||||
passed();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -51,26 +51,44 @@ int main()
|
||||
HIPCHECK(hipMalloc(&Y_d,Nbytes));
|
||||
HIPCHECK(hipMalloc(&Z_d,Nbytes));
|
||||
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK (hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HIPCHECK(hipStreamCreate(&s));
|
||||
|
||||
HIPCHECK(hipStreamCreate(&s));
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
|
||||
HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(X_d),
|
||||
static_cast<const int*>(Y_d),
|
||||
Z_d,
|
||||
N);
|
||||
HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK (hipDeviceSynchronize());
|
||||
HIPCHECK (hipStreamSynchronize(s));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
|
||||
HIPCHECK(hipStreamDestroy(s));
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIPCHECK(hipFree(X_d));
|
||||
@@ -79,7 +97,7 @@ int main()
|
||||
}
|
||||
|
||||
passed();
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -63,7 +63,16 @@ void simpleTest1()
|
||||
HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
|
||||
HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
|
||||
@@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
|
||||
|
||||
// This is the null stream?
|
||||
//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
|
||||
hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADDReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const T*>(A_d),
|
||||
static_cast<const T*>(B_d),
|
||||
C_d,
|
||||
numElements);
|
||||
|
||||
MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
|
||||
|
||||
|
||||
@@ -128,7 +128,17 @@ void Streamer<T>::enqueAsync()
|
||||
{
|
||||
printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
|
||||
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat);
|
||||
hipLaunchKernel(
|
||||
vectorADDRepeat,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
_stream,
|
||||
static_cast<const T*>(_A_d),
|
||||
static_cast<const T*>(_B_d),
|
||||
_C_d,
|
||||
_numElements,
|
||||
p_repeat);
|
||||
|
||||
}
|
||||
|
||||
@@ -225,7 +235,17 @@ int main(int argc, char *argv[])
|
||||
auto lastStreamer = streamers[s - 1];
|
||||
|
||||
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
|
||||
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
|
||||
hipLaunchKernel(
|
||||
vectorADDRepeat,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0/*nullstream*/,
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
nullStreamer->_C_d,
|
||||
numElements,
|
||||
1/*repeat*/);
|
||||
|
||||
|
||||
if (p_db) {
|
||||
@@ -257,7 +277,17 @@ int main(int argc, char *argv[])
|
||||
auto lastStreamer = streamers[s - 1];
|
||||
|
||||
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
|
||||
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
|
||||
hipLaunchKernel(
|
||||
vectorADDRepeat,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0/*nullstream*/,
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
nullStreamer->_C_d,
|
||||
numElements,
|
||||
1/*repeat*/);
|
||||
|
||||
nullStreamer->D2H();
|
||||
|
||||
|
||||
@@ -97,7 +97,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
// Launch kernel into null stream, should result in C_h == count.
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count);
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCountReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0 /*stream*/,
|
||||
static_cast<const int*>(C_d),
|
||||
C_h,
|
||||
numElements,
|
||||
count);
|
||||
HIPCHECK(hipEventRecord(stop, 0/*default*/));
|
||||
|
||||
switch (syncMode) {
|
||||
|
||||
@@ -163,9 +163,27 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
|
||||
if (_commandType == COMMAND_ADD_REVERSE) {
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCountReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
_stream,
|
||||
static_cast<const T*>(_A_d),
|
||||
_C_d,
|
||||
static_cast<int64_t>(_numElements),
|
||||
static_cast<int>(p_count));
|
||||
} else if (_commandType == COMMAND_ADD_FORWARD) {
|
||||
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCount,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
_stream,
|
||||
static_cast<const T*>(_A_d),
|
||||
_C_d,
|
||||
_numElements,
|
||||
static_cast<int>(p_count));
|
||||
} else if (_commandType == COMMAND_COPY) {
|
||||
HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
|
||||
} else {
|
||||
|
||||
Ссылка в новой задаче
Block a user