migrate program_state logic from header into shared library (phase I) (#1077)
* Revert "Revert "Use COMgr to read Kernel Args Metadata (#1006)"" This reverts commit62e96cb4cf. * Revert "Use COMgr to read Kernel Args Metadata (#1006)" This reverts commit882006555b. * Revert "improve program state commentary" This reverts commitfb2beb0c88. * Revert "load program state once per agent" This reverts commit21f5e142f5. * start moving function_names() into the hip shared lib * start moving code_object_blobs to a new "state" object * Consolidate various program state related static objects into a single program_state object * minor clean up * move more stuffs from functional_grid_launch into program_state * debug make_kernarg * moving lookup for kernargs size_align into program_state * clean up old code for kernarg size and alignment * update hip_module to use newer api in program_state * Create public member functions for program_state * move most program state functions into shared library * Pass the data buffer size to load_executable Otherwise, it can't figure what the data size is just from the char* (since the data is not really a string) * turning free functions in program state into members of program_state_impl * change the free function globals() into a member of program_state_impl * replace the static mutex used for populating globals * moving associate_code_object_symbols_with_host_allocation into program_state_impl * move load_code_object_and_freeze_executable into program_state_impl * moving executables and functions_names into program_state_impl * moving kernels() into program_state_impl * moving functions() into program_state_impl * move get_kernargs into program_state_impl * moving kernel_descriptor into program_state_impl * moving kernargs_size_align calculation into program_state_impl * Changing the handle to program_state_impl to a pointer * moving program_state_impl into a separate inline source file * fixing/cleaning up some header file includes * moving member function for kernargs_size_align into program_state.cpp * moving Kernel_descriptor into program_state.inl * add a new class to manage agent globals * moving all agent globals processing functions into agent_globals_impl * load program state once per agent re-merging PR991 against other program state changes * fix per-agent program state member initialization * cache executables based on elf name, isa, and agent. This avoids program state reloading executables after a shared library is dlopened. re-merging PR1057 against other program state changes * protect executables cache by a global mutex * return ref to executables cache * adapt PR#981 Make hipModuleGetGlobal be in HIP runtime [ROCm/hip commit:f5eb91d53d]
This commit is contained in:
gecommit door
Maneesh Gupta
bovenliggende
1f94348f2e
commit
d0252dfa79
@@ -232,6 +232,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
|
||||
set(SOURCE_FILES_RUNTIME
|
||||
src/code_object_bundle.cpp
|
||||
src/program_state.cpp
|
||||
src/hip_clang.cpp
|
||||
src/hip_hcc.cpp
|
||||
src/hip_context.cpp
|
||||
|
||||
@@ -59,7 +59,7 @@ template <
|
||||
typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
|
||||
inline std::vector<std::uint8_t> make_kernarg(
|
||||
const std::tuple<Ts...>&,
|
||||
const std::vector<std::pair<std::size_t, std::size_t>>&,
|
||||
const kernargs_size_align&,
|
||||
std::vector<std::uint8_t> kernarg) {
|
||||
return kernarg;
|
||||
}
|
||||
@@ -70,7 +70,7 @@ template <
|
||||
typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
|
||||
inline std::vector<std::uint8_t> make_kernarg(
|
||||
const std::tuple<Ts...>& formals,
|
||||
const std::vector<std::pair<std::size_t, std::size_t>>& size_align,
|
||||
const kernargs_size_align& size_align,
|
||||
std::vector<std::uint8_t> kernarg) {
|
||||
using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
|
||||
|
||||
@@ -86,13 +86,12 @@ inline std::vector<std::uint8_t> make_kernarg(
|
||||
#endif
|
||||
|
||||
kernarg.resize(round_up_to_next_multiple_nonnegative(
|
||||
kernarg.size(), size_align[n].second) + size_align[n].first);
|
||||
kernarg.size(), size_align.alignment(n)) + size_align.size(n));
|
||||
|
||||
std::memcpy(
|
||||
kernarg.data() + kernarg.size() - size_align[n].first,
|
||||
kernarg.data() + kernarg.size() - size_align.size(n),
|
||||
&std::get<n>(formals),
|
||||
size_align[n].first);
|
||||
|
||||
size_align.size(n));
|
||||
return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
|
||||
}
|
||||
|
||||
@@ -104,45 +103,17 @@ inline std::vector<std::uint8_t> make_kernarg(
|
||||
|
||||
if (sizeof...(Formals) == 0) return {};
|
||||
|
||||
auto it = function_names().find(reinterpret_cast<std::uintptr_t>(kernel));
|
||||
if (it == function_names().cend()) {
|
||||
hip_throw(std::runtime_error{"Undefined __global__ function."});
|
||||
}
|
||||
|
||||
auto it1 = kernargs().find(it->second);
|
||||
if (it1 == kernargs().end()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Missing metadata for __global__ function: " + it->second});
|
||||
}
|
||||
|
||||
std::tuple<Formals...> to_formals{std::move(actuals)};
|
||||
std::vector<std::uint8_t> kernarg;
|
||||
kernarg.reserve(sizeof(to_formals));
|
||||
|
||||
return make_kernarg<0>(to_formals, it1->second, std::move(kernarg));
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return make_kernarg<0>(to_formals,
|
||||
ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(kernel)),
|
||||
std::move(kernarg));
|
||||
}
|
||||
|
||||
inline
|
||||
std::string name(std::uintptr_t function_address)
|
||||
{
|
||||
const auto it = function_names().find(function_address);
|
||||
|
||||
if (it == function_names().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Invalid function passed to hipLaunchKernelGGL."});
|
||||
}
|
||||
|
||||
return it->second;
|
||||
}
|
||||
|
||||
inline
|
||||
std::string name(hsa_agent_t agent)
|
||||
{
|
||||
char n[64]{};
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
|
||||
|
||||
return std::string{n};
|
||||
}
|
||||
|
||||
hsa_agent_t target_agent(hipStream_t stream);
|
||||
|
||||
@@ -156,17 +127,10 @@ void hipLaunchKernelGGLImpl(
|
||||
hipStream_t stream,
|
||||
void** kernarg) {
|
||||
|
||||
auto agent = target_agent(stream);
|
||||
auto it = functions(agent).find(function_address);
|
||||
const auto& kd = hip_impl::get_program_state().kernel_descriptor(function_address,
|
||||
target_agent(stream));
|
||||
|
||||
if (it == functions(agent).cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"No device code available for function: " +
|
||||
name(function_address) +
|
||||
", for agent: " + name(agent)});
|
||||
}
|
||||
|
||||
hipModuleLaunchKernel(it->second, numBlocks.x, numBlocks.y, numBlocks.z,
|
||||
hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
|
||||
dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
|
||||
stream, nullptr, kernarg);
|
||||
}
|
||||
@@ -178,8 +142,7 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes, hipStream_t stream,
|
||||
Args... args) {
|
||||
hip_impl::hip_init();
|
||||
auto kernarg = hip_impl::make_kernarg(
|
||||
kernel, std::tuple<Args...>{std::move(args)...});
|
||||
auto kernarg = hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
|
||||
std::size_t kernarg_size = kernarg.size();
|
||||
|
||||
void* config[]{
|
||||
|
||||
@@ -67,6 +67,7 @@ THE SOFTWARE.
|
||||
#define HIP_LAUNCH_PARAM_END ((void*)0x03)
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <algorithm>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
@@ -2643,90 +2644,129 @@ std::vector<Agent_global> read_agent_globals(hsa_agent_t agent,
|
||||
hsa_executable_t executable);
|
||||
hsa_agent_t this_agent();
|
||||
|
||||
|
||||
class agent_globals_impl {
|
||||
private:
|
||||
std::pair<
|
||||
std::mutex,
|
||||
std::unordered_map<
|
||||
std::string, std::vector<Agent_global>>> globals_from_module;
|
||||
|
||||
std::unordered_map<
|
||||
hsa_agent_t,
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::vector<Agent_global>>> globals_from_process;
|
||||
|
||||
public:
|
||||
|
||||
hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
hipModule_t hmod, const char* name) {
|
||||
// the key of the map would the hash of code object associated with the
|
||||
// hipModule_t instance
|
||||
std::string key(hash_for(hmod));
|
||||
|
||||
if (globals_from_module.second.count(key) == 0) {
|
||||
std::lock_guard<std::mutex> lck{globals_from_module.first};
|
||||
|
||||
if (globals_from_module.second.count(key) == 0) {
|
||||
globals_from_module.second.emplace(
|
||||
key, read_agent_globals(this_agent(), executable_for(hmod)));
|
||||
}
|
||||
}
|
||||
|
||||
const auto it0 = globals_from_module.second.find(key);
|
||||
if (it0 == globals_from_module.second.cend()) {
|
||||
hip_throw(
|
||||
std::runtime_error{"agent_globals data structure corrupted."});
|
||||
}
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(),
|
||||
it0->second.cend(), name);
|
||||
|
||||
// HACK for SWDEV-173477
|
||||
//
|
||||
// For code objects with global symbols of length 0, ROCR runtime would
|
||||
// ignore them even though they exist in the symbol table. Therefore the
|
||||
// result from read_agent_globals() can't be trusted entirely.
|
||||
//
|
||||
// As a workaround to tame applications which depend on the existence of
|
||||
// global symbols with length 0, always return hipSuccess here.
|
||||
//
|
||||
// This behavior shall be reverted once ROCR runtime has been fixed to
|
||||
// address SWDEV-173477
|
||||
|
||||
//return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
const char* name) {
|
||||
|
||||
auto agent = this_agent();
|
||||
|
||||
std::call_once(globals_from_process[agent].first, [this](hsa_agent_t aa) {
|
||||
std::vector<Agent_global> tmp0;
|
||||
for (auto&& executable : hip_impl::get_program_state().executables(aa)) {
|
||||
auto tmp1 = read_agent_globals(aa, executable);
|
||||
tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()),
|
||||
make_move_iterator(tmp1.end()));
|
||||
}
|
||||
globals_from_process[aa].second = move(move(tmp0));
|
||||
}, agent);
|
||||
|
||||
const auto it = globals_from_process.find(agent);
|
||||
|
||||
if (it == globals_from_process.cend()) return hipErrorNotInitialized;
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(it->second.second.cbegin(),
|
||||
it->second.second.cend(), name);
|
||||
|
||||
return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
class agent_globals {
|
||||
public:
|
||||
agent_globals() : impl(new agent_globals_impl()) {
|
||||
if (!impl)
|
||||
hip_throw(
|
||||
std::runtime_error{"Error when constructing agent global data structures."});
|
||||
}
|
||||
~agent_globals() { delete impl; }
|
||||
|
||||
hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
hipModule_t hmod, const char* name) {
|
||||
return impl->read_agent_global_from_module(dptr, bytes, hmod, name);
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
const char* name) {
|
||||
return impl->read_agent_global_from_process(dptr, bytes, name);
|
||||
}
|
||||
|
||||
private:
|
||||
agent_globals_impl* impl;
|
||||
};
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
hipModule_t hmod, const char* name) {
|
||||
// the key of the map would the hash of code object associated with the
|
||||
// hipModule_t instance
|
||||
static std::unordered_map<
|
||||
std::string, std::vector<Agent_global>> agent_globals;
|
||||
std::string key(hash_for(hmod));
|
||||
|
||||
if (agent_globals.count(key) == 0) {
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
|
||||
if (agent_globals.count(key) == 0) {
|
||||
agent_globals.emplace(
|
||||
key, read_agent_globals(this_agent(), executable_for(hmod)));
|
||||
}
|
||||
}
|
||||
|
||||
const auto it0 = agent_globals.find(key);
|
||||
if (it0 == agent_globals.cend()) {
|
||||
hip_throw(
|
||||
std::runtime_error{"agent_globals data structure corrupted."});
|
||||
}
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(),
|
||||
it0->second.cend(), name);
|
||||
|
||||
// HACK for SWDEV-173477
|
||||
//
|
||||
// For code objects with global symbols of length 0, ROCR runtime would
|
||||
// ignore them even though they exist in the symbol table. Therefore the
|
||||
// result from read_agent_globals() can't be trusted entirely.
|
||||
//
|
||||
// As a workaround to tame applications which depend on the existence of
|
||||
// global symbols with length 0, always return hipSuccess here.
|
||||
//
|
||||
// This behavior shall be reverted once ROCR runtime has been fixed to
|
||||
// address SWDEV-173477
|
||||
|
||||
//return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
return hipSuccess;
|
||||
agent_globals& get_agent_globals() {
|
||||
static agent_globals ag;
|
||||
return ag;
|
||||
}
|
||||
|
||||
|
||||
extern "C"
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
const char* name) {
|
||||
static std::unordered_map<hsa_agent_t, std::pair<std::once_flag,
|
||||
std::vector<Agent_global>>> globals;
|
||||
static std::once_flag f;
|
||||
auto agent = this_agent();
|
||||
|
||||
// Create placeholder for each agent in the map.
|
||||
std::call_once(f, []() {
|
||||
for (auto&& x : hip_impl::all_hsa_agents()) {
|
||||
(void)globals[x];
|
||||
}
|
||||
});
|
||||
|
||||
if (globals.find(agent) == globals.cend()) {
|
||||
hip_throw(std::runtime_error{"invalid agent"});
|
||||
}
|
||||
|
||||
std::call_once(globals[agent].first, [](hsa_agent_t aa) {
|
||||
std::vector<Agent_global> tmp0;
|
||||
for (auto&& executable : executables(aa)) {
|
||||
auto tmp1 = read_agent_globals(aa, executable);
|
||||
tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()),
|
||||
make_move_iterator(tmp1.end()));
|
||||
}
|
||||
globals[aa].second = move(tmp0);
|
||||
}, agent);
|
||||
|
||||
const auto it = globals.find(agent);
|
||||
|
||||
if (it == globals.cend()) return hipErrorNotInitialized;
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(it->second.second.cbegin(),
|
||||
it->second.second.cend(), name);
|
||||
|
||||
return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
return get_agent_globals().read_agent_global_from_process(dptr, bytes, name);
|
||||
}
|
||||
|
||||
|
||||
} // Namespace hip_impl.
|
||||
|
||||
#if defined(__cplusplus)
|
||||
@@ -2748,6 +2788,7 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
#endif // __HIP_VDI__
|
||||
|
||||
hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name);
|
||||
|
||||
/**
|
||||
* @brief builds module from code object which resides in host memory. Image is pointer to that
|
||||
* location.
|
||||
|
||||
@@ -22,38 +22,17 @@ THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "code_object_bundle.hpp"
|
||||
#include "hsa_helpers.hpp"
|
||||
|
||||
#if !defined(__cpp_exceptions)
|
||||
#define try if (true)
|
||||
#define catch(...) if (false)
|
||||
#endif
|
||||
#include "elfio/elfio.hpp"
|
||||
#if !defined(__cpp_exceptions)
|
||||
#undef try
|
||||
#undef catch
|
||||
#endif
|
||||
|
||||
#include <hsa/amd_hsa_kernel_code.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
|
||||
#include <link.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <istream>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
struct ihipModuleSymbol_t;
|
||||
@@ -84,590 +63,48 @@ inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) {
|
||||
|
||||
namespace hip_impl {
|
||||
|
||||
std::vector<hsa_agent_t> all_hsa_agents();
|
||||
|
||||
class Kernel_descriptor {
|
||||
std::uint64_t kernel_object_{};
|
||||
amd_kernel_code_t const* kernel_header_{nullptr};
|
||||
std::string name_{};
|
||||
public:
|
||||
Kernel_descriptor() = default;
|
||||
Kernel_descriptor(std::uint64_t kernel_object, const std::string& name)
|
||||
: kernel_object_{kernel_object}, name_{name}
|
||||
{
|
||||
bool supported{false};
|
||||
std::uint16_t min_v{UINT16_MAX};
|
||||
auto r = hsa_system_major_extension_supported(
|
||||
HSA_EXTENSION_AMD_LOADER, 1, &min_v, &supported);
|
||||
|
||||
if (r != HSA_STATUS_SUCCESS || !supported) return;
|
||||
|
||||
hsa_ven_amd_loader_1_01_pfn_t tbl{};
|
||||
|
||||
r = hsa_system_get_major_extension_table(
|
||||
HSA_EXTENSION_AMD_LOADER,
|
||||
1,
|
||||
sizeof(tbl),
|
||||
reinterpret_cast<void*>(&tbl));
|
||||
|
||||
if (r != HSA_STATUS_SUCCESS) return;
|
||||
if (!tbl.hsa_ven_amd_loader_query_host_address) return;
|
||||
|
||||
r = tbl.hsa_ven_amd_loader_query_host_address(
|
||||
reinterpret_cast<void*>(kernel_object_),
|
||||
reinterpret_cast<const void**>(&kernel_header_));
|
||||
|
||||
if (r != HSA_STATUS_SUCCESS) return;
|
||||
}
|
||||
Kernel_descriptor(const Kernel_descriptor&) = default;
|
||||
Kernel_descriptor(Kernel_descriptor&&) = default;
|
||||
~Kernel_descriptor() = default;
|
||||
|
||||
Kernel_descriptor& operator=(const Kernel_descriptor&) = default;
|
||||
Kernel_descriptor& operator=(Kernel_descriptor&&) = default;
|
||||
|
||||
operator hipFunction_t() const { // TODO: this is awful and only meant for illustration.
|
||||
return reinterpret_cast<hipFunction_t>(const_cast<Kernel_descriptor*>(this));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename P>
|
||||
inline
|
||||
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) {
|
||||
const auto it = std::find_if(
|
||||
reader.sections.begin(), reader.sections.end(), std::move(p));
|
||||
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
hsa_isa_t, std::vector<std::vector<char>>>& code_object_blobs() {
|
||||
static std::unordered_map<hsa_isa_t, std::vector<std::vector<char>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
static std::vector<std::vector<char>> blobs{};
|
||||
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
ELFIO::elfio tmp;
|
||||
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_name() == ".kernel";
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
blobs.emplace_back(it->get_data(), it->get_data() + it->get_size());
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
|
||||
for (auto&& multi_arch_blob : blobs) {
|
||||
auto it = multi_arch_blob.begin();
|
||||
while (it != multi_arch_blob.end()) {
|
||||
Bundled_code_header tmp{it, multi_arch_blob.end()};
|
||||
|
||||
if (!valid(tmp)) break;
|
||||
|
||||
for (auto&& bundle : bundles(tmp)) {
|
||||
r[triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
|
||||
}
|
||||
|
||||
it += tmp.bundled_code_size;
|
||||
};
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
struct Symbol {
|
||||
std::string name;
|
||||
ELFIO::Elf64_Addr value = 0;
|
||||
ELFIO::Elf_Xword size = 0;
|
||||
ELFIO::Elf_Half sect_idx = 0;
|
||||
std::uint8_t bind = 0;
|
||||
std::uint8_t type = 0;
|
||||
std::uint8_t other = 0;
|
||||
};
|
||||
|
||||
inline
|
||||
Symbol read_symbol(const ELFIO::symbol_section_accessor& section,
|
||||
unsigned int idx) {
|
||||
assert(idx < section.get_symbols_num());
|
||||
|
||||
Symbol r;
|
||||
section.get_symbol(
|
||||
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::string,
|
||||
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& symbol_addresses() {
|
||||
static std::unordered_map<
|
||||
std::string, std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
ELFIO::elfio tmp;
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
const ELFIO::symbol_section_accessor symtab{tmp, it};
|
||||
|
||||
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
|
||||
auto s = read_symbol(symtab, i);
|
||||
|
||||
if (s.type != STT_OBJECT || s.sect_idx == SHN_UNDEF) continue;
|
||||
|
||||
const auto addr = s.value + info->dlpi_addr;
|
||||
r.emplace(std::move(s.name), std::make_pair(addr, s.size));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
std::unordered_map<std::string, void*>& globals() {
|
||||
static std::unordered_map<std::string, void*> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() { r.reserve(symbol_addresses().size()); });
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::string> copy_names_of_undefined_symbols(
|
||||
const ELFIO::symbol_section_accessor& section) {
|
||||
std::vector<std::string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(section, i);
|
||||
if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue;
|
||||
|
||||
r.push_back(std::move(tmp.name));
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
void hip_throw(const std::exception&);
|
||||
|
||||
inline
|
||||
void associate_code_object_symbols_with_host_allocation(
|
||||
const ELFIO::elfio& reader,
|
||||
ELFIO::section* code_object_dynsym,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable) {
|
||||
if (!code_object_dynsym) return;
|
||||
class kernargs_size_align;
|
||||
class program_state_impl;
|
||||
class program_state {
|
||||
public:
|
||||
program_state();
|
||||
~program_state();
|
||||
|
||||
const auto undefined_symbols = copy_names_of_undefined_symbols(
|
||||
ELFIO::symbol_section_accessor{reader, code_object_dynsym});
|
||||
hipFunction_t kernel_descriptor(std::uintptr_t,
|
||||
hsa_agent_t);
|
||||
|
||||
kernargs_size_align get_kernargs_size_align(std::uintptr_t);
|
||||
hsa_executable_t load_executable(const char*, const size_t,
|
||||
hsa_executable_t,
|
||||
hsa_agent_t);
|
||||
|
||||
for (auto&& x : undefined_symbols) {
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
void* global_addr_by_name(const char* name);
|
||||
|
||||
const auto it1 = symbol_addresses().find(x);
|
||||
// to fix later
|
||||
const std::vector<hsa_executable_t>& executables(hsa_agent_t agent);
|
||||
|
||||
if (it1 == symbol_addresses().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Global symbol: " + x + " is undefined."});
|
||||
}
|
||||
program_state(const program_state&) = delete;
|
||||
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
private:
|
||||
program_state_impl* impl;
|
||||
};
|
||||
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
|
||||
globals().emplace(x, (void*)(it1->second.first));
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(it1->second.first),
|
||||
it1->second.second,
|
||||
nullptr, // All agents.
|
||||
0,
|
||||
&p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void load_code_object_and_freeze_executable(
|
||||
const std::string& file, hsa_agent_t agent, hsa_executable_t executable) {
|
||||
// TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
|
||||
if (!p) return;
|
||||
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
delete p;
|
||||
};
|
||||
|
||||
using RAII_code_reader =
|
||||
std::unique_ptr<hsa_code_object_reader_t, decltype(cor_deleter)>;
|
||||
|
||||
if (file.empty()) return;
|
||||
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_memory(
|
||||
file.data(), file.size(), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(
|
||||
executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
static std::vector<RAII_code_reader> code_readers;
|
||||
static std::mutex mtx;
|
||||
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
code_readers.push_back(move(tmp));
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_executable_t load_executable(const std::string& file,
|
||||
hsa_executable_t executable,
|
||||
hsa_agent_t agent) {
|
||||
ELFIO::elfio reader;
|
||||
std::stringstream tmp{file};
|
||||
|
||||
if (!reader.load(tmp)) return hsa_executable_t{};
|
||||
|
||||
const auto code_object_dynsym = find_section_if(
|
||||
reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_DYNSYM;
|
||||
});
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(reader,
|
||||
code_object_dynsym,
|
||||
agent, executable);
|
||||
|
||||
load_code_object_and_freeze_executable(file, agent, executable);
|
||||
|
||||
return executable;
|
||||
}
|
||||
class kernargs_size_align {
|
||||
public:
|
||||
std::size_t size(std::size_t n) const;
|
||||
std::size_t alignment(std::size_t n) const;
|
||||
private:
|
||||
const void* handle;
|
||||
friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
|
||||
};
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::vector<hsa_executable_t>& executables(hsa_agent_t agent) {
|
||||
static std::unordered_map<hsa_agent_t, std::pair<std::once_flag,
|
||||
std::vector<hsa_executable_t>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
// Create placeholder for each agent in the map.
|
||||
std::call_once(f, []() {
|
||||
for (auto&& x : hip_impl::all_hsa_agents()) {
|
||||
(void)r[x];
|
||||
}
|
||||
});
|
||||
|
||||
if (r.find(agent) == r.cend()) {
|
||||
hip_throw(std::runtime_error{"invalid agent"});
|
||||
}
|
||||
|
||||
std::call_once(r[agent].first, [](hsa_agent_t aa) {
|
||||
hsa_agent_iterate_isas(aa, [](hsa_isa_t x, void* pa) {
|
||||
const auto it = code_object_blobs().find(x);
|
||||
|
||||
if (it == code_object_blobs().cend()) return HSA_STATUS_SUCCESS;
|
||||
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
|
||||
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only meant for
|
||||
// illustration.
|
||||
std::string blob_to_str{blob.cbegin(), blob.cend()};
|
||||
tmp = load_executable(blob_to_str, tmp, a);
|
||||
|
||||
if (tmp.handle) r[a].second.push_back(tmp);
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}, &aa);
|
||||
}, agent);
|
||||
|
||||
return r[agent].second;
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::pair<std::uintptr_t, std::string>> function_names_for(
|
||||
const ELFIO::elfio& reader, ELFIO::section* symtab) {
|
||||
std::vector<std::pair<std::uintptr_t, std::string>> r;
|
||||
ELFIO::symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(symbols, i);
|
||||
|
||||
if (tmp.type != STT_FUNC) continue;
|
||||
if (tmp.type == SHN_UNDEF) continue;
|
||||
if (tmp.name.empty()) continue;
|
||||
|
||||
r.emplace_back(tmp.value, tmp.name);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<std::uintptr_t, std::string>& function_names() {
|
||||
static std::unordered_map<std::uintptr_t, std::string> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
ELFIO::elfio tmp;
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
auto names = function_names_for(tmp, it);
|
||||
for (auto&& x : names) x.first += info->dlpi_addr;
|
||||
|
||||
r.insert(
|
||||
std::make_move_iterator(names.begin()),
|
||||
std::make_move_iterator(names.end()));
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::string, std::vector<hsa_executable_symbol_t>>& kernels(hsa_agent_t agent) {
|
||||
static std::unordered_map<hsa_agent_t, std::pair<std::once_flag,
|
||||
std::unordered_map<std::string, std::vector<hsa_executable_symbol_t>>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
// Create placeholder for each agent in the map.
|
||||
std::call_once(f, []() {
|
||||
for (auto&& x : hip_impl::all_hsa_agents()) {
|
||||
(void)r[x];
|
||||
}
|
||||
});
|
||||
|
||||
if (r.find(agent) == r.cend()) {
|
||||
hip_throw(std::runtime_error{"invalid agent"});
|
||||
}
|
||||
|
||||
std::call_once(r[agent].first, [](hsa_agent_t aa) {
|
||||
static const auto copy_kernels = [](
|
||||
hsa_executable_t, hsa_agent_t a, hsa_executable_symbol_t x, void*) {
|
||||
if (type(x) == HSA_SYMBOL_KIND_KERNEL) r[a].second[name(x)].push_back(x);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
for (auto&& executable : executables(aa)) {
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
executable, aa, copy_kernels, nullptr);
|
||||
}
|
||||
}, agent);
|
||||
|
||||
return r[agent].second;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::uintptr_t,
|
||||
Kernel_descriptor>& functions(hsa_agent_t agent) {
|
||||
static std::unordered_map<hsa_agent_t, std::pair<std::once_flag,
|
||||
std::unordered_map<std::uintptr_t, Kernel_descriptor>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
// Create placeholder for each agent in the map.
|
||||
std::call_once(f, []() {
|
||||
for (auto&& x : hip_impl::all_hsa_agents()) {
|
||||
(void)r[x];
|
||||
}
|
||||
});
|
||||
|
||||
if (r.find(agent) == r.cend()) {
|
||||
hip_throw(std::runtime_error{"invalid agent"});
|
||||
}
|
||||
|
||||
std::call_once(r[agent].first, [](hsa_agent_t aa) {
|
||||
for (auto&& function : function_names()) {
|
||||
const auto it = kernels(aa).find(function.second);
|
||||
|
||||
if (it == kernels(aa).cend()) continue;
|
||||
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
r[aa].second.emplace(
|
||||
function.first,
|
||||
Kernel_descriptor{kernel_object(kernel_symbol), it->first});
|
||||
}
|
||||
}
|
||||
}, agent);
|
||||
|
||||
return r[agent].second;
|
||||
}
|
||||
|
||||
inline
|
||||
std::size_t parse_args(
|
||||
const std::string& metadata,
|
||||
std::size_t f,
|
||||
std::size_t l,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
|
||||
if (f == l) return f;
|
||||
if (!size_align.empty()) return l;
|
||||
|
||||
do {
|
||||
static constexpr size_t size_sz{5};
|
||||
f = metadata.find("Size:", f) + size_sz;
|
||||
|
||||
if (l <= f) return f;
|
||||
|
||||
auto size = std::strtoul(&metadata[f], nullptr, 10);
|
||||
|
||||
static constexpr size_t align_sz{6};
|
||||
f = metadata.find("Align:", f) + align_sz;
|
||||
|
||||
char* l{};
|
||||
auto align = std::strtoul(&metadata[f], &l, 10);
|
||||
|
||||
f += (l - &metadata[f]) + 1;
|
||||
|
||||
size_align.emplace_back(size, align);
|
||||
} while (true);
|
||||
}
|
||||
|
||||
inline
|
||||
void read_kernarg_metadata(
|
||||
ELFIO::elfio& reader,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
|
||||
// TODO: this is inefficient.
|
||||
auto it = find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_NOTE;
|
||||
});
|
||||
|
||||
if (!it) return;
|
||||
|
||||
const ELFIO::note_section_accessor acc{reader, it};
|
||||
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
|
||||
ELFIO::Elf_Word type{};
|
||||
std::string name{};
|
||||
void* desc{};
|
||||
ELFIO::Elf_Word desc_size{};
|
||||
|
||||
acc.get_note(i, type, name, desc, desc_size);
|
||||
|
||||
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
|
||||
|
||||
std::string tmp{
|
||||
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
|
||||
|
||||
auto dx = tmp.find("Kernels:");
|
||||
|
||||
if (dx == std::string::npos) continue;
|
||||
|
||||
static constexpr decltype(tmp.size()) kernels_sz{8};
|
||||
dx += kernels_sz;
|
||||
|
||||
do {
|
||||
dx = tmp.find("Name:", dx);
|
||||
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) name_sz{5};
|
||||
dx = tmp.find_first_not_of(" '", dx + name_sz);
|
||||
|
||||
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
|
||||
dx += fn.size();
|
||||
|
||||
auto dx1 = tmp.find("CodeProps", dx);
|
||||
dx = tmp.find("Args:", dx);
|
||||
|
||||
if (dx1 < dx) {
|
||||
dx = dx1;
|
||||
continue;
|
||||
}
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) args_sz{5};
|
||||
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
|
||||
} while (true);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::string, std::vector<std::pair<std::size_t, std::size_t>>>& kernargs() {
|
||||
static std::unordered_map<
|
||||
std::string, std::vector<std::pair<std::size_t, std::size_t>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
for (auto&& isa_blobs : code_object_blobs()) {
|
||||
for (auto&& blob : isa_blobs.second) {
|
||||
std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
|
||||
|
||||
ELFIO::elfio reader;
|
||||
|
||||
if (!reader.load(tmp)) continue;
|
||||
|
||||
read_kernarg_metadata(reader, r);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
program_state& get_program_state() {
|
||||
static program_state ps;
|
||||
return ps;
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
@@ -85,7 +85,9 @@ __hipRegisterFatBinary(const void* data)
|
||||
reinterpret_cast<uintptr_t>(header) + desc->offset), desc->size};
|
||||
if (HIP_DUMP_CODE_OBJECT)
|
||||
__hipDumpCodeObject(image);
|
||||
module->executable = hip_impl::load_executable(image, module->executable, agent);
|
||||
module->executable = hip_impl::get_program_state().load_executable(image.data(), image.size(),
|
||||
module->executable,
|
||||
agent);
|
||||
|
||||
if (module->executable.handle) {
|
||||
modules->at(deviceId) = module;
|
||||
|
||||
@@ -2489,4 +2489,14 @@ namespace hip_impl {
|
||||
std::terminate();
|
||||
#endif
|
||||
}
|
||||
|
||||
std::mutex executables_cache_mutex;
|
||||
|
||||
std::vector<hsa_executable_t>& executables_cache(
|
||||
std::string elf, hsa_isa_t isa, hsa_agent_t agent) {
|
||||
static std::unordered_map<std::string,
|
||||
std::unordered_map<hsa_isa_t,
|
||||
std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>>>> cache;
|
||||
return cache[elf][isa][agent];
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
@@ -25,6 +25,7 @@ THE SOFTWARE.
|
||||
#include "hip/hcc_detail/hsa_helpers.hpp"
|
||||
#include "hip/hcc_detail/program_state.hpp"
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "program_state.inl"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include <hsa/amd_hsa_kernel_code.h>
|
||||
@@ -289,7 +290,7 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
|
||||
if (!name) return hipErrorNotInitialized;
|
||||
|
||||
return hip_impl::read_agent_global_from_module(dptr, bytes, hmod, name);
|
||||
return hip_impl::get_agent_globals().read_agent_global_from_module(dptr, bytes, hmod, name);
|
||||
}
|
||||
|
||||
namespace hip_impl {
|
||||
@@ -512,11 +513,8 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
|
||||
if (!func) return hipErrorInvalidDeviceFunction;
|
||||
|
||||
auto agent = this_agent();
|
||||
const auto it = functions(agent).find(reinterpret_cast<uintptr_t>(func));
|
||||
|
||||
if (it == functions(agent).cend()) return hipErrorInvalidDeviceFunction;
|
||||
|
||||
const auto header = static_cast<hipFunction_t>(it->second)->_header;
|
||||
auto kd = get_program_state().kernel_descriptor(reinterpret_cast<uintptr_t>(func), agent);
|
||||
const auto header = kd->_header;
|
||||
|
||||
if (!header) throw runtime_error{"Ill-formed Kernel_descriptor."};
|
||||
|
||||
@@ -548,7 +546,8 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
|
||||
|
||||
auto content = tmp.empty() ? read_elf_file_as_string(image) : tmp;
|
||||
|
||||
(*module)->executable = load_executable(content, (*module)->executable,
|
||||
(*module)->executable = get_program_state().load_executable(
|
||||
content.data(), content.size(), (*module)->executable,
|
||||
this_agent());
|
||||
|
||||
// compute the hash of the code object
|
||||
@@ -591,10 +590,10 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
|
||||
if (!texRef) return ihipLogStatus(hipErrorInvalidValue);
|
||||
|
||||
if (!hmod || !name) return ihipLogStatus(hipErrorNotInitialized);
|
||||
|
||||
auto addr = get_program_state().global_addr_by_name(name);
|
||||
if (addr == nullptr) return ihipLogStatus(hipErrorInvalidValue);
|
||||
|
||||
const auto it = globals().find(name);
|
||||
if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue);
|
||||
|
||||
*texRef = reinterpret_cast<textureReference*>(it->second);
|
||||
*texRef = reinterpret_cast<textureReference*>(addr);
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
#include "../include/hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <stdexcept>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
// contains implementation of program_state_impl
|
||||
#include "program_state.inl"
|
||||
|
||||
namespace hip_impl {
|
||||
|
||||
std::size_t kernargs_size_align::kernargs_size_align::size(std::size_t n) const{
|
||||
return (*reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(handle))[n].first;
|
||||
}
|
||||
|
||||
std::size_t kernargs_size_align::alignment(std::size_t n) const{
|
||||
return (*reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(handle))[n].second;
|
||||
}
|
||||
|
||||
program_state::program_state() :
|
||||
impl(new program_state_impl) {
|
||||
if (!impl) hip_throw(std::runtime_error {
|
||||
"Unknown error when constructing program state."});
|
||||
}
|
||||
|
||||
program_state::~program_state() {
|
||||
delete(impl);
|
||||
}
|
||||
|
||||
void* program_state::global_addr_by_name(const char* name) {
|
||||
const auto it = impl->get_globals().find(name);
|
||||
if (it == impl->get_globals().end())
|
||||
return nullptr;
|
||||
else
|
||||
return it->second;
|
||||
}
|
||||
|
||||
hsa_executable_t program_state::load_executable(const char* data,
|
||||
const size_t data_size,
|
||||
hsa_executable_t executable,
|
||||
hsa_agent_t agent) {
|
||||
return impl->load_executable(data, data_size, executable, agent);
|
||||
}
|
||||
|
||||
const std::vector<hsa_executable_t>& program_state::executables(hsa_agent_t agent) {
|
||||
return impl->get_executables(agent);
|
||||
}
|
||||
|
||||
hipFunction_t program_state::kernel_descriptor(std::uintptr_t function_address,
|
||||
hsa_agent_t agent) {
|
||||
auto& kd = impl->kernel_descriptor(function_address, agent);
|
||||
return kd;
|
||||
}
|
||||
|
||||
kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t kernel) {
|
||||
kernargs_size_align t;
|
||||
t.handle = reinterpret_cast<const void*>(&impl->kernargs_size_align(kernel));
|
||||
return t;
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,713 @@
|
||||
#include "../include/hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
#include "../include/hip/hcc_detail/hsa_helpers.hpp"
|
||||
|
||||
#if !defined(__cpp_exceptions)
|
||||
#define try if (true)
|
||||
#define catch(...) if (false)
|
||||
#endif
|
||||
#include "../include/hip/hcc_detail/elfio/elfio.hpp"
|
||||
#if !defined(__cpp_exceptions)
|
||||
#undef try
|
||||
#undef catch
|
||||
#endif
|
||||
|
||||
#include <hsa/amd_hsa_kernel_code.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
|
||||
#include <link.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace hip_impl {
|
||||
|
||||
[[noreturn]]
|
||||
void hip_throw(const std::exception&);
|
||||
|
||||
std::vector<hsa_agent_t> all_hsa_agents();
|
||||
|
||||
extern std::mutex executables_cache_mutex;
|
||||
|
||||
std::vector<hsa_executable_t>& executables_cache(std::string, hsa_isa_t, hsa_agent_t);
|
||||
|
||||
template<typename P>
|
||||
inline
|
||||
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) {
|
||||
const auto it = std::find_if(
|
||||
reader.sections.begin(), reader.sections.end(), std::move(p));
|
||||
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
struct Symbol {
|
||||
std::string name;
|
||||
ELFIO::Elf64_Addr value = 0;
|
||||
ELFIO::Elf_Xword size = 0;
|
||||
ELFIO::Elf_Half sect_idx = 0;
|
||||
std::uint8_t bind = 0;
|
||||
std::uint8_t type = 0;
|
||||
std::uint8_t other = 0;
|
||||
};
|
||||
|
||||
class Kernel_descriptor {
|
||||
std::uint64_t kernel_object_{};
|
||||
amd_kernel_code_t const* kernel_header_{nullptr};
|
||||
std::string name_{};
|
||||
public:
|
||||
Kernel_descriptor() = default;
|
||||
Kernel_descriptor(std::uint64_t kernel_object, const std::string& name)
|
||||
: kernel_object_{kernel_object}, name_{name}
|
||||
{
|
||||
bool supported{false};
|
||||
std::uint16_t min_v{UINT16_MAX};
|
||||
auto r = hsa_system_major_extension_supported(
|
||||
HSA_EXTENSION_AMD_LOADER, 1, &min_v, &supported);
|
||||
|
||||
if (r != HSA_STATUS_SUCCESS || !supported) return;
|
||||
|
||||
hsa_ven_amd_loader_1_01_pfn_t tbl{};
|
||||
|
||||
r = hsa_system_get_major_extension_table(
|
||||
HSA_EXTENSION_AMD_LOADER,
|
||||
1,
|
||||
sizeof(tbl),
|
||||
reinterpret_cast<void*>(&tbl));
|
||||
|
||||
if (r != HSA_STATUS_SUCCESS) return;
|
||||
if (!tbl.hsa_ven_amd_loader_query_host_address) return;
|
||||
|
||||
r = tbl.hsa_ven_amd_loader_query_host_address(
|
||||
reinterpret_cast<void*>(kernel_object_),
|
||||
reinterpret_cast<const void**>(&kernel_header_));
|
||||
|
||||
if (r != HSA_STATUS_SUCCESS) return;
|
||||
}
|
||||
Kernel_descriptor(const Kernel_descriptor&) = default;
|
||||
Kernel_descriptor(Kernel_descriptor&&) = default;
|
||||
~Kernel_descriptor() = default;
|
||||
|
||||
Kernel_descriptor& operator=(const Kernel_descriptor&) = default;
|
||||
Kernel_descriptor& operator=(Kernel_descriptor&&) = default;
|
||||
|
||||
operator hipFunction_t() const { // TODO: this is awful and only meant for illustration.
|
||||
return reinterpret_cast<hipFunction_t>(const_cast<Kernel_descriptor*>(this));
|
||||
}
|
||||
};
|
||||
|
||||
class program_state_impl {
|
||||
|
||||
public:
|
||||
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::unordered_map<
|
||||
hsa_isa_t,
|
||||
std::vector<std::vector<char>>>>> code_object_blobs;
|
||||
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>> symbol_addresses;
|
||||
|
||||
std::unordered_map<
|
||||
hsa_agent_t,
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::vector<hsa_executable_t>>> executables;
|
||||
|
||||
std::unordered_map<
|
||||
hsa_agent_t,
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<hsa_executable_symbol_t>>>> kernels;
|
||||
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::unordered_map<
|
||||
std::string, std::vector<std::pair<std::size_t, std::size_t>>>> kernargs;
|
||||
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::unordered_map<std::uintptr_t, std::string>> function_names;
|
||||
|
||||
std::unordered_map<
|
||||
hsa_agent_t,
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
std::unordered_map<
|
||||
std::uintptr_t,
|
||||
Kernel_descriptor>>> functions;
|
||||
|
||||
std::tuple<
|
||||
std::once_flag,
|
||||
std::mutex,
|
||||
std::unordered_map<std::string, void*>> globals;
|
||||
|
||||
using RAII_code_reader =
|
||||
std::unique_ptr<hsa_code_object_reader_t,
|
||||
std::function<void(hsa_code_object_reader_t*)>>;
|
||||
std::pair<
|
||||
std::mutex,
|
||||
std::vector<RAII_code_reader>> code_readers;
|
||||
|
||||
program_state_impl() {
|
||||
// Create placeholder for each agent for the per-agent members.
|
||||
for (auto&& x : hip_impl::all_hsa_agents()) {
|
||||
(void)executables[x];
|
||||
(void)kernels[x];
|
||||
(void)functions[x];
|
||||
}
|
||||
}
|
||||
|
||||
const std::unordered_map<
|
||||
std::string,
|
||||
std::unordered_map<
|
||||
hsa_isa_t,
|
||||
std::vector<std::vector<char>>>>& get_code_object_blobs() {
|
||||
|
||||
std::call_once(code_object_blobs.first, [this]() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) {
|
||||
ELFIO::elfio tmp;
|
||||
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_name() == ".kernel";
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
auto& impl = *static_cast<program_state_impl*>(p);
|
||||
|
||||
std::vector<char> multi_arch_blob(it->get_data(), it->get_data() + it->get_size());
|
||||
auto blob_it = multi_arch_blob.begin();
|
||||
while (blob_it != multi_arch_blob.end()) {
|
||||
Bundled_code_header tmp{blob_it, multi_arch_blob.end()};
|
||||
|
||||
if (!valid(tmp)) break;
|
||||
|
||||
for (auto&& bundle : bundles(tmp)) {
|
||||
impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
|
||||
}
|
||||
|
||||
blob_it += tmp.bundled_code_size;
|
||||
};
|
||||
|
||||
return 0;
|
||||
}, this);
|
||||
});
|
||||
|
||||
return code_object_blobs.second;
|
||||
}
|
||||
|
||||
Symbol read_symbol(const ELFIO::symbol_section_accessor& section,
|
||||
unsigned int idx) {
|
||||
assert(idx < section.get_symbols_num());
|
||||
|
||||
Symbol r;
|
||||
section.get_symbol(
|
||||
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const std::unordered_map<
|
||||
std::string,
|
||||
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& get_symbol_addresses() {
|
||||
|
||||
std::call_once(symbol_addresses.first, [this]() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* psi_ptr) {
|
||||
|
||||
if (!psi_ptr)
|
||||
return 0;
|
||||
|
||||
program_state_impl* t = static_cast<program_state_impl*>(psi_ptr);
|
||||
|
||||
ELFIO::elfio tmp;
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
const ELFIO::symbol_section_accessor symtab{tmp, it};
|
||||
|
||||
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
|
||||
auto s = t->read_symbol(symtab, i);
|
||||
|
||||
if (s.type != STT_OBJECT || s.sect_idx == SHN_UNDEF) continue;
|
||||
|
||||
const auto addr = s.value + info->dlpi_addr;
|
||||
t->symbol_addresses.second.emplace(std::move(s.name), std::make_pair(addr, s.size));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}, this);
|
||||
});
|
||||
|
||||
return symbol_addresses.second;
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, void*>& get_globals() {
|
||||
std::call_once(std::get<0>(globals), [this]() {
|
||||
std::get<2>(globals).reserve(get_symbol_addresses().size());
|
||||
});
|
||||
return std::get<2>(globals);
|
||||
}
|
||||
|
||||
std::mutex& get_globals_mutex() {
|
||||
return std::get<1>(globals);
|
||||
}
|
||||
|
||||
std::vector<std::string> copy_names_of_undefined_symbols(
|
||||
const ELFIO::symbol_section_accessor& section) {
|
||||
std::vector<std::string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(section, i);
|
||||
if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue;
|
||||
|
||||
r.push_back(std::move(tmp.name));
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void associate_code_object_symbols_with_host_allocation(
|
||||
const ELFIO::elfio& reader,
|
||||
ELFIO::section* code_object_dynsym,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable) {
|
||||
if (!code_object_dynsym) return;
|
||||
|
||||
const auto undefined_symbols = copy_names_of_undefined_symbols(
|
||||
ELFIO::symbol_section_accessor{reader, code_object_dynsym});
|
||||
|
||||
auto& g = get_globals();
|
||||
auto& g_mutex = get_globals_mutex();
|
||||
for (auto&& x : undefined_symbols) {
|
||||
|
||||
if (g.find(x) != g.cend()) return;
|
||||
|
||||
const auto it1 = get_symbol_addresses().find(x);
|
||||
|
||||
if (it1 == get_symbol_addresses().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Global symbol: " + x + " is undefined."});
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lck{g_mutex};
|
||||
|
||||
if (g.find(x) != g.cend()) return;
|
||||
|
||||
g.emplace(x, (void*)(it1->second.first));
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(it1->second.first),
|
||||
it1->second.second,
|
||||
nullptr, // All agents.
|
||||
0,
|
||||
&p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
}
|
||||
}
|
||||
|
||||
void load_code_object_and_freeze_executable(
|
||||
const std::string& file, hsa_agent_t agent, hsa_executable_t executable) {
|
||||
// TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
if (file.empty()) return;
|
||||
|
||||
static const auto cor_deleter = [] (hsa_code_object_reader_t* p) {
|
||||
if (!p) return;
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
delete p;
|
||||
};
|
||||
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_memory(
|
||||
file.data(), file.size(), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(
|
||||
executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
std::lock_guard<std::mutex> lck{code_readers.first};
|
||||
code_readers.second.push_back(move(tmp));
|
||||
}
|
||||
|
||||
|
||||
const std::vector<hsa_executable_t>& get_executables(hsa_agent_t agent) {
|
||||
|
||||
if (executables.find(agent) == executables.cend()) {
|
||||
hip_throw(std::runtime_error{"invalid agent"});
|
||||
}
|
||||
|
||||
std::call_once(executables[agent].first, [this](hsa_agent_t aa) {
|
||||
auto data = std::make_pair(this, &aa);
|
||||
hsa_agent_iterate_isas(aa, [](hsa_isa_t x, void* d) {
|
||||
auto& p = *static_cast<decltype(data)*>(d);
|
||||
auto& impl = *(p.first);
|
||||
for (const auto code_object_it : impl.get_code_object_blobs()) {
|
||||
const auto elf = code_object_it.first;
|
||||
const auto code_object_blobs = code_object_it.second;
|
||||
const auto it = code_object_blobs.find(x);
|
||||
|
||||
if (it == code_object_blobs.cend()) continue;
|
||||
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(p.second);
|
||||
|
||||
std::lock_guard<std::mutex> lck{executables_cache_mutex};
|
||||
|
||||
std::vector<hsa_executable_t>& current_exes =
|
||||
hip_impl::executables_cache(elf, x, a);
|
||||
// check the cache for already loaded executables
|
||||
if (current_exes.empty()) {
|
||||
// executables do not yet exist for this elf+isa+agent, create and cache them
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only meant for
|
||||
// illustration.
|
||||
tmp = impl.load_executable(blob.data(), blob.size(), tmp, a);
|
||||
|
||||
if (tmp.handle) current_exes.push_back(tmp);
|
||||
}
|
||||
}
|
||||
// append cached executables to our agent's vector of executables
|
||||
impl.executables[a].second.insert(impl.executables[a].second.end(),
|
||||
current_exes.begin(), current_exes.end());
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}, &data);
|
||||
}, agent);
|
||||
|
||||
return executables[agent].second;
|
||||
}
|
||||
|
||||
hsa_executable_t load_executable(const char* data,
|
||||
const size_t data_size,
|
||||
hsa_executable_t executable,
|
||||
hsa_agent_t agent) {
|
||||
ELFIO::elfio reader;
|
||||
std::string ts = std::string(data, data_size);
|
||||
std::stringstream tmp{ts};
|
||||
|
||||
if (!reader.load(tmp)) return hsa_executable_t{};
|
||||
const auto code_object_dynsym = find_section_if(
|
||||
reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_DYNSYM;
|
||||
});
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(reader,
|
||||
code_object_dynsym,
|
||||
agent, executable);
|
||||
|
||||
load_code_object_and_freeze_executable(ts, agent, executable);
|
||||
|
||||
return executable;
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::uintptr_t, std::string>> function_names_for(
|
||||
const ELFIO::elfio& reader, ELFIO::section* symtab) {
|
||||
std::vector<std::pair<std::uintptr_t, std::string>> r;
|
||||
ELFIO::symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(symbols, i);
|
||||
|
||||
if (tmp.type != STT_FUNC) continue;
|
||||
if (tmp.type == SHN_UNDEF) continue;
|
||||
if (tmp.name.empty()) continue;
|
||||
|
||||
r.emplace_back(tmp.value, tmp.name);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const std::unordered_map<std::uintptr_t, std::string>& get_function_names() {
|
||||
|
||||
std::call_once(function_names.first, [this]() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) {
|
||||
ELFIO::elfio tmp;
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
auto& impl = *static_cast<program_state_impl*>(p);
|
||||
|
||||
auto names = impl.function_names_for(tmp, it);
|
||||
for (auto&& x : names) x.first += info->dlpi_addr;
|
||||
|
||||
impl.function_names.second.insert(
|
||||
std::make_move_iterator(names.begin()),
|
||||
std::make_move_iterator(names.end()));
|
||||
|
||||
return 0;
|
||||
}, this);
|
||||
});
|
||||
|
||||
return function_names.second;
|
||||
}
|
||||
|
||||
const std::unordered_map<
|
||||
std::string, std::vector<hsa_executable_symbol_t>>& get_kernels(hsa_agent_t agent) {
|
||||
|
||||
if (kernels.find(agent) == kernels.cend()) {
|
||||
hip_throw(std::runtime_error{"invalid agent"});
|
||||
}
|
||||
|
||||
std::call_once(kernels[agent].first, [this](hsa_agent_t aa) {
|
||||
static const auto copy_kernels = [](
|
||||
hsa_executable_t, hsa_agent_t a, hsa_executable_symbol_t x, void* p) {
|
||||
auto& impl = *static_cast<program_state_impl*>(p);
|
||||
if (type(x) == HSA_SYMBOL_KIND_KERNEL) impl.kernels[a].second[hip_impl::name(x)].push_back(x);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
for (auto&& executable : get_executables(aa)) {
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
executable, aa, copy_kernels, this);
|
||||
}
|
||||
}, agent);
|
||||
|
||||
return kernels[agent].second;
|
||||
}
|
||||
|
||||
const std::unordered_map<
|
||||
std::uintptr_t,
|
||||
Kernel_descriptor>& get_functions(hsa_agent_t agent) {
|
||||
|
||||
if (functions.find(agent) == functions.cend()) {
|
||||
hip_throw(std::runtime_error{"invalid agent"});
|
||||
}
|
||||
|
||||
std::call_once(functions[agent].first, [this](hsa_agent_t aa) {
|
||||
for (auto&& function : get_function_names()) {
|
||||
const auto it = get_kernels(aa).find(function.second);
|
||||
|
||||
if (it == get_kernels(aa).cend()) continue;
|
||||
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
functions[aa].second.emplace(
|
||||
function.first,
|
||||
Kernel_descriptor{kernel_object(kernel_symbol), it->first});
|
||||
}
|
||||
}
|
||||
}, agent);
|
||||
|
||||
return functions[agent].second;
|
||||
}
|
||||
|
||||
std::size_t parse_args(
|
||||
const std::string& metadata,
|
||||
std::size_t f,
|
||||
std::size_t l,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
|
||||
if (f == l) return f;
|
||||
if (!size_align.empty()) return l;
|
||||
|
||||
do {
|
||||
static constexpr size_t size_sz{5};
|
||||
f = metadata.find("Size:", f) + size_sz;
|
||||
|
||||
if (l <= f) return f;
|
||||
|
||||
auto size = std::strtoul(&metadata[f], nullptr, 10);
|
||||
|
||||
static constexpr size_t align_sz{6};
|
||||
f = metadata.find("Align:", f) + align_sz;
|
||||
|
||||
char* l{};
|
||||
auto align = std::strtoul(&metadata[f], &l, 10);
|
||||
|
||||
f += (l - &metadata[f]) + 1;
|
||||
|
||||
size_align.emplace_back(size, align);
|
||||
} while (true);
|
||||
}
|
||||
|
||||
void read_kernarg_metadata(
|
||||
ELFIO::elfio& reader,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
|
||||
// TODO: this is inefficient.
|
||||
auto it = find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_NOTE;
|
||||
});
|
||||
|
||||
if (!it) return;
|
||||
|
||||
const ELFIO::note_section_accessor acc{reader, it};
|
||||
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
|
||||
ELFIO::Elf_Word type{};
|
||||
std::string name{};
|
||||
void* desc{};
|
||||
ELFIO::Elf_Word desc_size{};
|
||||
|
||||
acc.get_note(i, type, name, desc, desc_size);
|
||||
|
||||
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
|
||||
|
||||
std::string tmp{
|
||||
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
|
||||
|
||||
auto dx = tmp.find("Kernels:");
|
||||
|
||||
if (dx == std::string::npos) continue;
|
||||
|
||||
static constexpr decltype(tmp.size()) kernels_sz{8};
|
||||
dx += kernels_sz;
|
||||
|
||||
do {
|
||||
dx = tmp.find("Name:", dx);
|
||||
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) name_sz{5};
|
||||
dx = tmp.find_first_not_of(" '", dx + name_sz);
|
||||
|
||||
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
|
||||
dx += fn.size();
|
||||
|
||||
auto dx1 = tmp.find("CodeProps", dx);
|
||||
dx = tmp.find("Args:", dx);
|
||||
|
||||
if (dx1 < dx) {
|
||||
dx = dx1;
|
||||
continue;
|
||||
}
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) args_sz{5};
|
||||
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
|
||||
} while (true);
|
||||
}
|
||||
}
|
||||
|
||||
const std::unordered_map<std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& get_kernargs() {
|
||||
|
||||
std::call_once(kernargs.first, [this]() {
|
||||
for (auto&& name_and_isa_blobs : get_code_object_blobs()) {
|
||||
for (auto&& isa_blobs : name_and_isa_blobs.second) {
|
||||
for (auto&& blob : isa_blobs.second) {
|
||||
std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
|
||||
|
||||
ELFIO::elfio reader;
|
||||
|
||||
if (!reader.load(tmp)) continue;
|
||||
|
||||
read_kernarg_metadata(reader, kernargs.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return kernargs.second;
|
||||
}
|
||||
|
||||
std::string name(std::uintptr_t function_address)
|
||||
{
|
||||
const auto it = get_function_names().find(function_address);
|
||||
|
||||
if (it == get_function_names().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Invalid function passed to hipLaunchKernelGGL."});
|
||||
}
|
||||
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::string name(hsa_agent_t agent)
|
||||
{
|
||||
char n[64]{};
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
|
||||
|
||||
return std::string{n};
|
||||
}
|
||||
|
||||
const Kernel_descriptor& kernel_descriptor(std::uintptr_t function_address,
|
||||
hsa_agent_t agent) {
|
||||
|
||||
auto it0 = get_functions(agent).find(function_address);
|
||||
|
||||
if (it0 == get_functions(agent).cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"No device code available for function: " +
|
||||
std::string(name(function_address)) +
|
||||
", for agent: " + name(agent)});
|
||||
}
|
||||
|
||||
return it0->second;
|
||||
}
|
||||
|
||||
const std::vector<std::pair<std::size_t, std::size_t>>&
|
||||
kernargs_size_align(std::uintptr_t kernel) {
|
||||
|
||||
auto it = get_function_names().find(kernel);
|
||||
if (it == get_function_names().cend()) {
|
||||
hip_throw(std::runtime_error{"Undefined __global__ function."});
|
||||
}
|
||||
|
||||
auto it1 = get_kernargs().find(it->second);
|
||||
if (it1 == get_kernargs().end()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Missing metadata for __global__ function: " + it->second});
|
||||
}
|
||||
|
||||
return it1->second;
|
||||
}
|
||||
}; // class program_state_impl
|
||||
|
||||
};
|
||||
Verwijs in nieuw issue
Block a user