* Initial attempt to switch over to internally linked state.

* Add missing CMake update.

* hipLaunchKernelGGLImpl must be inline as well. Ensure internal linkage.

* Ensure global retrieval uses internally linked state.

* Hide HC in the implementation. Minimise ADL woes.

* Strange software exists, and must be catered to.

* Use a less spammy mechanism for ensuring internal linkage / non-export.

* Remove leftover internal detail.
Этот коммит содержится в:
Alex Voicu
2019-03-06 14:01:44 +02:00
коммит произвёл Maneesh Gupta
родитель 2d67bc5e9c
Коммит ea0fcf3e61
24 изменённых файлов: 1018 добавлений и 1069 удалений
-1
Просмотреть файл
@@ -240,7 +240,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
src/hip_surface.cpp
src/hip_intercept.cpp
src/env.cpp
src/program_state.cpp
src/h2f.cpp)
execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS)
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
Просмотреть файл
+78 -21
Просмотреть файл
@@ -31,6 +31,7 @@ THE SOFTWARE.
#include "hip/hip_hcc.h"
#include "hip_runtime.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
@@ -105,21 +106,13 @@ inline std::vector<std::uint8_t> make_kernarg(
auto it = function_names().find(reinterpret_cast<std::uintptr_t>(kernel));
if (it == function_names().cend()) {
it =
function_names(true).find(reinterpret_cast<std::uintptr_t>(kernel));
if (it == function_names().cend()) {
throw std::runtime_error{"Undefined __global__ function."};
}
hip_throw(std::runtime_error{"Undefined __global__ function."});
}
auto it1 = kernargs().find(it->second);
if (it1 == kernargs().end()) {
it1 = kernargs(true).find(it->second);
if (it1 == kernargs().end()) {
throw std::runtime_error{
"Missing metadata for __global__ function: " + it->second};
}
hip_throw(std::runtime_error{
"Missing metadata for __global__ function: " + it->second});
}
std::tuple<Formals...> to_formals{std::move(actuals)};
@@ -129,23 +122,87 @@ inline std::vector<std::uint8_t> make_kernarg(
return make_kernarg<0>(to_formals, it1->second, std::move(kernarg));
}
void hipLaunchKernelGGLImpl(std::uintptr_t function_address, const dim3& numBlocks,
const dim3& dimBlocks, std::uint32_t sharedMemBytes, hipStream_t stream,
void** kernarg);
} // Namespace hip_impl.
inline
std::string name(std::uintptr_t function_address)
{
const auto it = function_names().find(function_address);
if (it == function_names().cend()) {
hip_throw(std::runtime_error{
"Invalid function passed to hipLaunchKernelGGL."});
}
return it->second;
}
inline
std::string name(hsa_agent_t agent)
{
char n[64]{};
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
return std::string{n};
}
hsa_agent_t target_agent(hipStream_t stream);
inline
__attribute__((visibility("hidden")))
void hipLaunchKernelGGLImpl(
std::uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg) {
auto it0 = functions().find(function_address);
if (it0 == functions().cend()) {
hip_throw(std::runtime_error{
"No device code available for function: " +
name(function_address)});
}
auto agent = target_agent(stream);
const auto it1 = std::find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const std::pair<hsa_agent_t, Kernel_descriptor>& x) {
return x.first == agent;
});
if (it1 == it0->second.cend()) {
hip_throw(std::runtime_error{
"No code available for function: " + name(function_address) +
", for agent: " + name(agent)});
}
hipModuleLaunchKernel(it1->second, numBlocks.x, numBlocks.y, numBlocks.z,
dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
stream, nullptr, kernarg);
}
} // Namespace hip_impl.
template <typename... Args, typename F = void (*)(Args...)>
inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
inline
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
std::uint32_t sharedMemBytes, hipStream_t stream,
Args... args) {
auto kernarg = hip_impl::make_kernarg(
kernel, std::tuple<Args...>{std::move(args)...});
std::size_t kernarg_size = kernarg.size();
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE,
&kernarg_size, HIP_LAUNCH_PARAM_END};
void* config[]{
HIP_LAUNCH_PARAM_BUFFER_POINTER,
kernarg.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE,
&kernarg_size,
HIP_LAUNCH_PARAM_END};
hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel), numBlocks, dimBlocks,
sharedMemBytes, stream, &config[0]);
hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
numBlocks, dimBlocks, sharedMemBytes,
stream, &config[0]);
}
template <typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
+240 -34
Просмотреть файл
@@ -35,11 +35,14 @@ THE SOFTWARE.
#define GENERIC_GRID_LAUNCH 1
#endif
#include <hsa/hsa.h>
#include <hip/hcc_detail/host_defines.h>
#include <hip/hip_runtime_api.h>
#include <hip/hcc_detail/driver_types.h>
#include <hip/hcc_detail/hip_texture_types.h>
#include <hip/hcc_detail/hip_surface_types.h>
#include <hip/hcc_detail/program_state.hpp>
#if defined(_MSC_VER)
#define DEPRECATED(msg) __declspec(deprecated(msg))
@@ -58,6 +61,11 @@ THE SOFTWARE.
#define HIP_LAUNCH_PARAM_END ((void*)0x03)
#ifdef __cplusplus
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#define __dparm(x) \
= x
#else
@@ -1363,6 +1371,61 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h
hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes,
hipStream_t stream);
__attribute__((visibility("hidden")))
hipError_t hipModuleGetGlobal(void**, size_t*, hipModule_t, const char*);
/**
* @brief Copies the memory address of symbol @p symbolName to @p devPtr
*
* @param[in] symbolName - Symbol on device
* @param[out] devPtr - Pointer to a pointer to the memory referred to by the symbol
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
*
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
* hipMemcpyFromSymbolAsync
*/
inline
__attribute__((visibility("hidden")))
hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
//HIP_INIT_API(hipGetSymbolAddress, devPtr, symbolName);
size_t size = 0;
return hipModuleGetGlobal(devPtr, &size, 0, (const char*)symbolName);
}
/**
* @brief Copies the size of symbol @p symbolName to @p size
*
* @param[in] symbolName - Symbol on device
* @param[out] size - Pointer to the size of the symbol
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
*
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
* hipMemcpyFromSymbolAsync
*/
inline
__attribute__((visibility("hidden")))
hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {
// HIP_INIT_API(hipGetSymbolSize, size, symbolName);
void* devPtr = nullptr;
return hipModuleGetGlobal(&devPtr, size, 0, (const char*)symbolName);
}
#if defined(__cplusplus)
} // extern "C"
#endif
namespace hip_impl {
hipError_t hipMemcpyToSymbol(void*, const void*, size_t, size_t, hipMemcpyKind,
const char*);
} // Namespace hip_impl.
#if defined(__cplusplus)
extern "C" {
#endif
/**
* @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area
@@ -1387,35 +1450,36 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t siz
* hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
* hipMemcpyFromSymbolAsync
*/
hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src, size_t sizeBytes,
size_t offset __dparm(0), hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
inline
__attribute__((visibility("hidden")))
hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src,
size_t sizeBytes, size_t offset __dparm(0),
hipMemcpyKind kind __dparm(hipMemcpyHostToDevice)) {
if (!symbolName) return hipErrorInvalidSymbol;
hipDeviceptr_t dst = NULL;
hipGetSymbolAddress(&dst, (const char*)symbolName);
/**
* @brief Copies the memory address of symbol @p symbolName to @p devPtr
*
* @param[in] symbolName - Symbol on device
* @param[out] devPtr - Pointer to a pointer to the memory referred to by the symbol
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
*
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
* hipMemcpyFromSymbolAsync
*/
hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName);
return hip_impl::hipMemcpyToSymbol(dst, src, sizeBytes, offset, kind,
(const char*)symbolName);
}
#if defined(__cplusplus)
} // extern "C"
#endif
/**
* @brief Copies the size of symbol @p symbolName to @p size
*
* @param[in] symbolName - Symbol on device
* @param[out] size - Pointer to the size of the symbol
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
*
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
* hipMemcpyFromSymbolAsync
*/
hipError_t hipGetSymbolSize(size_t* size, const void* symbolName);
namespace hip_impl {
hipError_t hipMemcpyToSymbolAsync(void*, const void*, size_t, size_t,
hipMemcpyKind, hipStream_t, const char*);
hipError_t hipMemcpyFromSymbol(void*, const void*, size_t, size_t,
hipMemcpyKind, const char*);
hipError_t hipMemcpyFromSymbolAsync(void*, const void*, size_t, size_t,
hipMemcpyKind, hipStream_t, const char*);
} // Namespace hip_impl.
#if defined(__cplusplus)
extern "C" {
#endif
/**
* @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area
@@ -1442,14 +1506,50 @@ hipError_t hipGetSymbolSize(size_t* size, const void* symbolName);
* hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
* hipMemcpyFromSymbolAsync
*/
hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src, size_t sizeBytes,
size_t offset, hipMemcpyKind kind, hipStream_t stream __dparm(0));
inline
__attribute__((visibility("hidden")))
hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src,
size_t sizeBytes, size_t offset,
hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
if (!symbolName) return hipErrorInvalidSymbol;
hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes,
size_t offset __dparm(0), hipMemcpyKind kind __dparm( hipMemcpyDeviceToHost ));
hipDeviceptr_t dst = NULL;
hipGetSymbolAddress(&dst, symbolName);
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t sizeBytes,
size_t offset, hipMemcpyKind kind, hipStream_t stream __dparm(0));
return hip_impl::hipMemcpyToSymbolAsync(dst, src, sizeBytes, offset, kind,
stream,
(const char*)symbolName);
}
inline
__attribute__((visibility("hidden")))
hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName,
size_t sizeBytes, size_t offset __dparm(0),
hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
if (!symbolName) return hipErrorInvalidSymbol;
hipDeviceptr_t src = NULL;
hipGetSymbolAddress(&src, symbolName);
return hip_impl::hipMemcpyFromSymbol(dst, src, sizeBytes, offset, kind,
(const char*)symbolName);
}
inline
__attribute__((visibility("hidden")))
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
size_t sizeBytes, size_t offset,
hipMemcpyKind kind,
hipStream_t stream __dparm(0)) {
if (!symbolName) return hipErrorInvalidSymbol;
hipDeviceptr_t src = NULL;
hipGetSymbolAddress(&src, symbolName);
return hip_impl::hipMemcpyFromSymbolAsync(dst, src, sizeBytes, offset, kind,
stream,
(const char*)symbolName);
}
/**
* @brief Copy data from src to dst asynchronously.
@@ -2397,6 +2497,103 @@ hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, con
hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func);
struct Agent_global {
std::string name;
hipDeviceptr_t address;
uint32_t byte_cnt;
};
#if defined(__cplusplus)
} // extern "C"
#endif
namespace hip_impl {
hsa_executable_t executable_for(hipModule_t);
const std::string& hash_for(hipModule_t);
template<typename ForwardIterator>
std::pair<hipDeviceptr_t, std::size_t> read_global_description(
ForwardIterator f, ForwardIterator l, const char* name) {
const auto it = std::find_if(f, l, [=](const Agent_global& x) {
return x.name == name;
});
return it == l ?
std::make_pair(nullptr, 0u) : std::make_pair(it->address, it->byte_cnt);
}
std::vector<Agent_global> read_agent_globals(hsa_agent_t agent,
hsa_executable_t executable);
hsa_agent_t this_agent();
inline
__attribute__((visibility("hidden")))
hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
hipModule_t hmod, const char* name) {
// the key of the map would the hash of code object associated with the
// hipModule_t instance
static std::unordered_map<
std::string, std::vector<Agent_global>> agent_globals;
auto key = hash_for(hmod);
if (agent_globals.count(key) == 0) {
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
if (agent_globals.count(key) == 0) {
agent_globals.emplace(
key, read_agent_globals(this_agent(), executable_for(hmod)));
}
}
const auto it0 = agent_globals.find(key);
if (it0 == agent_globals.cend()) {
hip_throw(
std::runtime_error{"agent_globals data structure corrupted."});
}
std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(),
it0->second.cend(), name);
return *dptr ? hipSuccess : hipErrorNotFound;
}
inline
__attribute__((visibility("hidden")))
hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
const char* name) {
static std::unordered_map<
hsa_agent_t, std::vector<Agent_global>> agent_globals;
static std::once_flag f;
std::call_once(f, []() {
for (auto&& agent_executables : executables()) {
std::vector<Agent_global> tmp0;
for (auto&& executable : agent_executables.second) {
auto tmp1 = read_agent_globals(agent_executables.first,
executable);
tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()),
make_move_iterator(tmp1.end()));
}
agent_globals.emplace(agent_executables.first, move(tmp0));
}
});
const auto it = agent_globals.find(this_agent());
if (it == agent_globals.cend()) return hipErrorNotInitialized;
std::tie(*dptr, *bytes) = read_global_description(it->second.cbegin(),
it->second.cend(), name);
return *dptr ? hipSuccess : hipErrorNotFound;
}
} // Namespace hip_impl.
#if defined(__cplusplus)
extern "C" {
#endif
/**
* @brief returns device memory pointer and size of the kernel present in the module with symbol @p
* name
@@ -2408,11 +2605,20 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func);
*
* @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
*/
hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
const char* name);
inline
__attribute__((visibility("hidden")))
hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
hipModule_t hmod, const char* name) {
if (!dptr || !bytes) return hipErrorInvalidValue;
hipError_t ihipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
const char* name);
if (!name) return hipErrorNotInitialized;
const auto r = hmod ?
hip_impl::read_agent_global_from_module(dptr, bytes, hmod, name) :
hip_impl::read_agent_global_from_process(dptr, bytes, name);
return r;
}
hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name);
/**
-9
Просмотреть файл
@@ -27,15 +27,6 @@ THE SOFTWARE.
#include <functional>
#include <string>
inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) { return x.handle == y.handle; }
namespace std {
template <>
struct hash<hsa_isa_t> {
size_t operator()(hsa_isa_t x) const { return hash<decltype(x.handle)>{}(x.handle); }
};
} // namespace std
namespace hip_impl {
inline void* address(hsa_executable_symbol_t x) {
void* r = nullptr;
+2 -2
Просмотреть файл
@@ -85,7 +85,7 @@ requires(Domain<K> ==
hc::parallel_for_each(acc_v, d, k);
} catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
throw;
hip_throw(ex);
}
}
@@ -113,7 +113,7 @@ requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch
group_mem_bytes, acc_v, std::move(k));
} catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
throw;
hip_throw(ex);
}
}
+550 -14
Просмотреть файл
@@ -22,14 +22,35 @@ THE SOFTWARE.
#pragma once
#include "code_object_bundle.hpp"
#include "hsa_helpers.hpp"
#if !defined(__cpp_exceptions)
#define try if (true)
#define catch(...) if (false)
#endif
#include "elfio/elfio.hpp"
#if !defined(__cpp_exceptions)
#undef try
#undef catch
#endif
#include <hsa/amd_hsa_kernel_code.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_loader.h>
#include <link.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <istream>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
@@ -39,13 +60,27 @@ struct ihipModuleSymbol_t;
using hipFunction_t = ihipModuleSymbol_t*;
namespace std {
template <>
template<>
struct hash<hsa_agent_t> {
size_t operator()(hsa_agent_t x) const { return hash<decltype(x.handle)>{}(x.handle); }
size_t operator()(hsa_agent_t x) const {
return hash<decltype(x.handle)>{}(x.handle);
}
};
template<>
struct hash<hsa_isa_t> {
size_t operator()(hsa_isa_t x) const {
return hash<decltype(x.handle)>{}(x.handle);
}
};
} // namespace std
inline constexpr bool operator==(hsa_agent_t x, hsa_agent_t y) { return x.handle == y.handle; }
inline constexpr bool operator==(hsa_agent_t x, hsa_agent_t y) {
return x.handle == y.handle;
}
inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) {
return x.handle == y.handle;
}
namespace hip_impl {
class Kernel_descriptor {
@@ -93,16 +128,517 @@ public:
}
};
const std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>>& executables(
bool rebuild = false);
const std::unordered_map<std::uintptr_t, std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>&
functions(bool rebuild = false);
const std::unordered_map<std::uintptr_t, std::string>& function_names(bool rebuild = false);
std::unordered_map<std::string, void*>& globals(bool rebuild = false);
const std::unordered_map<
std::string, std::vector<std::pair<std::size_t, std::size_t>>>&
kernargs(bool rebuild = false);
template<typename P>
inline
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) {
const auto it = std::find_if(
reader.sections.begin(), reader.sections.end(), std::move(p));
hsa_executable_t load_executable(const std::string& file, hsa_executable_t executable,
hsa_agent_t agent);
return it != reader.sections.end() ? *it : nullptr;
}
inline
__attribute__((visibility("hidden")))
const std::unordered_map<
hsa_isa_t, std::vector<std::vector<char>>>& code_object_blobs() {
static std::unordered_map<hsa_isa_t, std::vector<std::vector<char>>> r;
static std::once_flag f;
std::call_once(f, []() {
static std::vector<std::vector<char>> blobs{};
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
ELFIO::elfio tmp;
const auto elf =
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
if (!tmp.load(elf)) return 0;
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
return x->get_name() == ".kernel";
});
if (!it) return 0;
blobs.emplace_back(it->get_data(), it->get_data() + it->get_size());
return 0;
}, nullptr);
for (auto&& multi_arch_blob : blobs) {
auto it = multi_arch_blob.begin();
while (it != multi_arch_blob.end()) {
Bundled_code_header tmp{it, multi_arch_blob.end()};
if (!valid(tmp)) break;
for (auto&& bundle : bundles(tmp)) {
r[triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
}
it += tmp.bundled_code_size;
};
}
});
return r;
}
struct Symbol {
std::string name;
ELFIO::Elf64_Addr value = 0;
ELFIO::Elf_Xword size = 0;
ELFIO::Elf_Half sect_idx = 0;
std::uint8_t bind = 0;
std::uint8_t type = 0;
std::uint8_t other = 0;
};
inline
Symbol read_symbol(const ELFIO::symbol_section_accessor& section,
unsigned int idx) {
assert(idx < section.get_symbols_num());
Symbol r;
section.get_symbol(
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
return r;
}
inline
__attribute__((visibility("hidden")))
const std::unordered_map<
std::string,
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& symbol_addresses() {
static std::unordered_map<
std::string, std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>> r;
static std::once_flag f;
std::call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
ELFIO::elfio tmp;
const auto elf =
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
if (!tmp.load(elf)) return 0;
auto it = find_section_if(tmp, [](const ELFIO::section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (!it) return 0;
const ELFIO::symbol_section_accessor symtab{tmp, it};
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
auto s = read_symbol(symtab, i);
if (s.type != STT_OBJECT || s.sect_idx == SHN_UNDEF) continue;
const auto addr = s.value + info->dlpi_addr;
r.emplace(std::move(s.name), std::make_pair(addr, s.size));
}
return 0;
}, nullptr);
});
return r;
}
inline
__attribute__((visibility("hidden")))
std::unordered_map<std::string, void*>& globals() {
static std::unordered_map<std::string, void*> r;
static std::once_flag f;
std::call_once(f, []() { r.reserve(symbol_addresses().size()); });
return r;
}
inline
std::vector<std::string> copy_names_of_undefined_symbols(
const ELFIO::symbol_section_accessor& section) {
std::vector<std::string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(section, i);
if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue;
r.push_back(std::move(tmp.name));
}
return r;
}
[[noreturn]]
void hip_throw(const std::exception&);
inline
void associate_code_object_symbols_with_host_allocation(
const ELFIO::elfio& reader,
ELFIO::section* code_object_dynsym,
hsa_agent_t agent,
hsa_executable_t executable) {
if (!code_object_dynsym) return;
const auto undefined_symbols = copy_names_of_undefined_symbols(
ELFIO::symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
if (globals().find(x) != globals().cend()) return;
const auto it1 = symbol_addresses().find(x);
if (it1 == symbol_addresses().cend()) {
hip_throw(std::runtime_error{
"Global symbol: " + x + " is undefined."});
}
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
if (globals().find(x) != globals().cend()) return;
globals().emplace(x, (void*)(it1->second.first));
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(it1->second.first),
it1->second.second,
nullptr, // All agents.
0,
&p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
}
}
inline
void load_code_object_and_freeze_executable(
const std::string& file, hsa_agent_t agent, hsa_executable_t executable) {
// TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
if (!p) return;
hsa_code_object_reader_destroy(*p);
delete p;
};
using RAII_code_reader =
std::unique_ptr<hsa_code_object_reader_t, decltype(cor_deleter)>;
if (file.empty()) return;
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_memory(
file.data(), file.size(), tmp.get());
hsa_executable_load_agent_code_object(
executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static std::vector<RAII_code_reader> code_readers;
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
inline
hsa_executable_t load_executable(const std::string& file,
hsa_executable_t executable,
hsa_agent_t agent) {
ELFIO::elfio reader;
std::stringstream tmp{file};
if (!reader.load(tmp)) return hsa_executable_t{};
const auto code_object_dynsym = find_section_if(
reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_DYNSYM;
});
associate_code_object_symbols_with_host_allocation(reader,
code_object_dynsym,
agent, executable);
load_code_object_and_freeze_executable(file, agent, executable);
return executable;
}
std::vector<hsa_agent_t> all_hsa_agents();
inline
__attribute__((visibility("hidden")))
const std::unordered_map<
hsa_agent_t, std::vector<hsa_executable_t>>& executables() {
static std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>> r;
static std::once_flag f;
std::call_once(f, []() {
for (auto&& agent : hip_impl::all_hsa_agents()) {
hsa_agent_iterate_isas(agent, [](hsa_isa_t x, void* pa) {
const auto it = code_object_blobs().find(x);
if (it == code_object_blobs().cend()) return HSA_STATUS_SUCCESS;
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
for (auto&& blob : it->second) {
hsa_executable_t tmp = {};
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&tmp);
// TODO: this is massively inefficient and only meant for
// illustration.
std::string blob_to_str{blob.cbegin(), blob.cend()};
tmp = load_executable(blob_to_str, tmp, a);
if (tmp.handle) r[a].push_back(tmp);
}
return HSA_STATUS_SUCCESS;
}, &agent);
}
});
return r;
}
inline
std::vector<std::pair<std::uintptr_t, std::string>> function_names_for(
const ELFIO::elfio& reader, ELFIO::section* symtab) {
std::vector<std::pair<std::uintptr_t, std::string>> r;
ELFIO::symbol_section_accessor symbols{reader, symtab};
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(symbols, i);
if (tmp.type != STT_FUNC) continue;
if (tmp.type == SHN_UNDEF) continue;
if (tmp.name.empty()) continue;
r.emplace_back(tmp.value, tmp.name);
}
return r;
}
inline
__attribute__((visibility("hidden")))
const std::unordered_map<std::uintptr_t, std::string>& function_names() {
static std::unordered_map<std::uintptr_t, std::string> r;
static std::once_flag f;
std::call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
ELFIO::elfio tmp;
const auto elf =
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
if (!tmp.load(elf)) return 0;
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (!it) return 0;
auto names = function_names_for(tmp, it);
for (auto&& x : names) x.first += info->dlpi_addr;
r.insert(
std::make_move_iterator(names.begin()),
std::make_move_iterator(names.end()));
return 0;
}, nullptr);
});
return r;
}
inline
__attribute__((visibility("hidden")))
const std::unordered_map<
std::string, std::vector<hsa_executable_symbol_t>>& kernels() {
static std::unordered_map<
std::string, std::vector<hsa_executable_symbol_t>> r;
static std::once_flag f;
std::call_once(f, []() {
static const auto copy_kernels = [](
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void*) {
if (type(x) == HSA_SYMBOL_KIND_KERNEL) r[name(x)].push_back(x);
return HSA_STATUS_SUCCESS;
};
for (auto&& agent_executables : executables()) {
for (auto&& executable : agent_executables.second) {
hsa_executable_iterate_agent_symbols(
executable, agent_executables.first, copy_kernels, nullptr);
}
}
});
return r;
}
inline
__attribute__((visibility("hidden")))
const std::unordered_map<
std::uintptr_t,
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions() {
static std::unordered_map<
std::uintptr_t,
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>> r;
static std::once_flag f;
std::call_once(f, []() {
for (auto&& function : function_names()) {
const auto it = kernels().find(function.second);
if (it == kernels().cend()) continue;
for (auto&& kernel_symbol : it->second) {
r[function.first].emplace_back(
agent(kernel_symbol),
Kernel_descriptor{kernel_object(kernel_symbol), it->first});
}
}
});
return r;
}
inline
std::size_t parse_args(
const std::string& metadata,
std::size_t f,
std::size_t l,
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
if (f == l) return f;
if (!size_align.empty()) return l;
do {
static constexpr size_t size_sz{5};
f = metadata.find("Size:", f) + size_sz;
if (l <= f) return f;
auto size = std::strtoul(&metadata[f], nullptr, 10);
static constexpr size_t align_sz{6};
f = metadata.find("Align:", f) + align_sz;
char* l{};
auto align = std::strtoul(&metadata[f], &l, 10);
f += (l - &metadata[f]) + 1;
size_align.emplace_back(size, align);
} while (true);
}
inline
void read_kernarg_metadata(
ELFIO::elfio& reader,
std::unordered_map<
std::string,
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
// TODO: this is inefficient.
auto it = find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_NOTE;
});
if (!it) return;
const ELFIO::note_section_accessor acc{reader, it};
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
ELFIO::Elf_Word type{};
std::string name{};
void* desc{};
ELFIO::Elf_Word desc_size{};
acc.get_note(i, type, name, desc, desc_size);
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
std::string tmp{
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
auto dx = tmp.find("Kernels:");
if (dx == std::string::npos) continue;
static constexpr decltype(tmp.size()) kernels_sz{8};
dx += kernels_sz;
do {
dx = tmp.find("Name:", dx);
if (dx == std::string::npos) break;
static constexpr decltype(tmp.size()) name_sz{5};
dx = tmp.find_first_not_of(" '", dx + name_sz);
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
dx += fn.size();
auto dx1 = tmp.find("CodeProps", dx);
dx = tmp.find("Args:", dx);
if (dx1 < dx) {
dx = dx1;
continue;
}
if (dx == std::string::npos) break;
static constexpr decltype(tmp.size()) args_sz{5};
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
} while (true);
}
}
inline
__attribute__((visibility("hidden")))
const std::unordered_map<
std::string, std::vector<std::pair<std::size_t, std::size_t>>>& kernargs() {
static std::unordered_map<
std::string, std::vector<std::pair<std::size_t, std::size_t>>> r;
static std::once_flag f;
std::call_once(f, []() {
for (auto&& isa_blobs : code_object_blobs()) {
for (auto&& blob : isa_blobs.second) {
std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
ELFIO::elfio reader;
if (!reader.load(tmp)) continue;
read_kernarg_metadata(reader, r);
}
}
});
return r;
}
} // Namespace hip_impl.
+1 -1
Просмотреть файл
@@ -4,7 +4,7 @@
#include "clara/clara.hpp"
#include "pstreams/pstream.h"
#include "../src/elfio/elfio.hpp"
#include "../include/hip/hcc_detail/elfio/elfio.hpp"
#include <unistd.h>
+11 -90
Просмотреть файл
@@ -41,99 +41,20 @@ using namespace std;
namespace hip_impl
{
namespace
hsa_agent_t target_agent(hipStream_t stream)
{
inline
string name(uintptr_t function_address)
{
const auto it = function_names().find(function_address);
if (it == function_names().cend()) {
throw runtime_error{
"Invalid function passed to hipLaunchKernelGGL."};
}
return it->second;
if (stream) {
return *static_cast<hsa_agent_t*>(
stream->locked_getAv()->get_hsa_agent());
}
inline
string name(hsa_agent_t agent)
{
char n[64] = {};
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
return string{n};
else if (
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
return ihipGetDevice(
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
}
inline
hsa_agent_t target_agent(hipStream_t stream)
{
if (stream) {
return *static_cast<hsa_agent_t*>(
stream->locked_getAv()->get_hsa_agent());
}
else if (
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
return ihipGetDevice(
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
}
else {
return *static_cast<hsa_agent_t*>(
accelerator{}.get_default_view().get_hsa_agent());
}
else {
return *static_cast<hsa_agent_t*>(
accelerator{}.get_default_view().get_hsa_agent());
}
}
void hipLaunchKernelGGLImpl(
uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg)
{
auto it0 = functions().find(function_address);
if (it0 == functions().cend()) {
// Re-init device code maps once again to help locate kernels
// loaded after HIP runtime initialization via means such as
// dlopen().
it0 = functions(true).find(function_address);
if (it0 == functions().cend()) {
throw runtime_error{
"No device code available for function: " +
name(function_address)
};
}
}
auto agent = target_agent(stream);
const auto it1 = find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const pair<hsa_agent_t, Kernel_descriptor>& x) {
return x.first == agent;
});
if (it1 == it0->second.cend()) {
throw runtime_error{
"No code available for function: " + name(function_address) +
", for agent: " + name(agent)
};
}
hipModuleLaunchKernel(
it1->second,
numBlocks.x,
numBlocks.y,
numBlocks.z,
dimBlocks.x,
dimBlocks.y,
dimBlocks.z,
sharedMemBytes,
stream,
nullptr,
kernarg);
}
}
+28 -2
Просмотреть файл
@@ -27,6 +27,7 @@ THE SOFTWARE.
* everywhere. This file is compiled and linked into apps running HIP / HCC path.
*/
#include <assert.h>
#include <exception>
#include <stdint.h>
#include <iostream>
#include <sstream>
@@ -237,7 +238,7 @@ hipError_t ihipSynchronize(void) {
//=================================================================================================
TidInfo::TidInfo() : _apiSeqNum(0) {
_shortTid = g_lastShortTid.fetch_add(1);
_pid = getpid();
_pid = getpid();
if (COMPILE_HIP_DB && HIP_TRACE_API) {
std::stringstream tid_ss;
@@ -2397,7 +2398,7 @@ void ihipStream_t::locked_copy2DAsync(void* dst, const void* src, size_t width,
crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
copyDevice ? &copyDevice->getDevice()->_acc : nullptr,
forceUnpinnedCopy);
}
}
}
//-------------------------------------------------------------------------------------------------
@@ -2460,3 +2461,28 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** a
//// TODO - add identifier numbers for streams and devices to help with debugging.
// TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx)
namespace hip_impl {
std::vector<hsa_agent_t> all_hsa_agents() {
std::vector<hsa_agent_t> r{};
for (auto&& acc : hc::accelerator::get_all()) {
const auto agent = acc.get_hsa_agent();
if (!agent || !acc.is_hsa_accelerator()) continue;
r.emplace_back(*static_cast<hsa_agent_t*>(agent));
}
return r;
}
[[noreturn]]
void hip_throw(const std::exception& ex) {
#if defined(__cpp_exceptions)
throw ex;
#else
std::cerr << ex.what() << std::endl;
std::terminate();
#endif
}
} // Namespace hip_impl.
+28 -92
Просмотреть файл
@@ -955,37 +955,14 @@ hipError_t hipHostUnregister(void* hostPtr) {
return ihipLogStatus(hip_status);
}
namespace {
inline hipDeviceptr_t agent_address_for_symbol(const char* symbolName) {
hipDeviceptr_t r = nullptr;
namespace hip_impl {
hipError_t hipMemcpyToSymbol(void* dst, const void* src, size_t count,
size_t offset, hipMemcpyKind kind,
const char* symbol_name) {
HIP_INIT_SPECIAL_API(hipMemcpyToSymbol, (TRACE_MCMD), symbol_name, src,
count, offset, kind);
#if __hcc_workweek__ >= 17481
size_t byte_cnt = 0u;
ihipModuleGetGlobal(&r, &byte_cnt, 0, symbolName);
#else
auto ctx = ihipGetTlsDefaultCtx();
auto acc = ctx->getDevice()->_acc;
r = acc.get_symbol_address(symbolName);
#endif
return r;
}
} // namespace
hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src, size_t count, size_t offset,
hipMemcpyKind kind) {
HIP_INIT_SPECIAL_API(hipMemcpyToSymbol, (TRACE_MCMD), symbolName, src, count, offset, kind);
if (symbolName == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
auto ctx = ihipGetTlsDefaultCtx();
hc::accelerator acc = ctx->getDevice()->_acc;
hipDeviceptr_t dst = agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
if (dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
@@ -1003,21 +980,13 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src, size_t cou
return ihipLogStatus(hipSuccess);
}
hipError_t hipMemcpyFromSymbol(void* dst, const void* src, size_t count,
size_t offset, hipMemcpyKind kind,
const char* symbol_name) {
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbol, (TRACE_MCMD), symbol_name, dst,
count, offset, kind);
hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset,
hipMemcpyKind kind) {
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbol, (TRACE_MCMD), symbolName, dst, count, offset, kind);
if (symbolName == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
auto ctx = ihipGetTlsDefaultCtx();
hc::accelerator acc = ctx->getDevice()->_acc;
hipDeviceptr_t src = agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
if (dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
@@ -1036,27 +1005,19 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
}
hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src, size_t count,
size_t offset, hipMemcpyKind kind, hipStream_t stream) {
HIP_INIT_SPECIAL_API(hipMemcpyToSymbolAsync, (TRACE_MCMD), symbolName, src, count, offset, kind, stream);
hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count,
size_t offset, hipMemcpyKind kind,
hipStream_t stream, const char* symbol_name) {
HIP_INIT_SPECIAL_API(hipMemcpyToSymbolAsync, (TRACE_MCMD), symbol_name, src,
count, offset, kind, stream);
if (symbolName == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
hipError_t e = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
hc::accelerator acc = ctx->getDevice()->_acc;
hipDeviceptr_t dst = agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
if (dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
hipError_t e = hipSuccess;
if (stream) {
try {
hip_internal::memcpyAsync((char*)dst+offset, src, count, kind, stream);
@@ -1070,28 +1031,19 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src, size_
return ihipLogStatus(e);
}
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count,
size_t offset, hipMemcpyKind kind,
hipStream_t stream, const char* symbol_name) {
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbolAsync, (TRACE_MCMD), symbol_name,
dst, count, offset, kind, stream);
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset,
hipMemcpyKind kind, hipStream_t stream) {
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbolAsync, (TRACE_MCMD), symbolName, dst, count, offset, kind, stream);
if (symbolName == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
hipError_t e = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
hc::accelerator acc = ctx->getDevice()->_acc;
hipDeviceptr_t src = agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src);
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, src);
if (src == nullptr || dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
hipError_t e = hipSuccess;
stream = ihipSyncAndResolveStream(stream);
if (stream) {
try {
@@ -1105,23 +1057,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
return ihipLogStatus(e);
}
hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
HIP_INIT_API(hipGetSymbolAddress, devPtr, symbolName);
size_t size = 0;
return ihipModuleGetGlobal(devPtr, &size, 0, static_cast<const char*>(symbolName));
}
hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {
HIP_INIT_API(hipGetSymbolSize, size, symbolName);
void* devPtr = nullptr;
return ihipModuleGetGlobal(&devPtr, size, 0, static_cast<const char*>(symbolName));
}
} // Namespace hip_impl.
//---
hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) {
+80 -144
Просмотреть файл
@@ -20,11 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "elfio/elfio.hpp"
#include "hip/hip_runtime.h"
#include "hip/hcc_detail/elfio/elfio.hpp"
#include "hip/hcc_detail/hsa_helpers.hpp"
#include "hip/hcc_detail/program_state.hpp"
#include "hip_hcc_internal.h"
#include "hsa_helpers.hpp"
#include "trace_helper.h"
#include <hsa/amd_hsa_kernel_code.h>
@@ -52,7 +52,6 @@ THE SOFTWARE.
// TODO Use Pool APIs from HCC to get memory regions.
using namespace ELFIO;
using namespace hip_impl;
using namespace std;
// calculate MD5 checksum
@@ -268,13 +267,33 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
}
namespace {
struct Agent_global {
string name;
hipDeviceptr_t address;
uint32_t byte_cnt;
};
namespace hip_impl {
hsa_executable_t executable_for(hipModule_t hmod) {
return hmod->executable;
}
const std::string& hash_for(hipModule_t hmod) {
return hmod->hash;
}
hsa_agent_t this_agent() {
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) throw runtime_error{"No active HIP context."};
auto device = ctx->getDevice();
if (!device) throw runtime_error{"No device available for HIP."};
ihipDevice_t* currentDevice = ihipGetDevice(device->_deviceId);
if (!currentDevice) throw runtime_error{"No active device for HIP."};
return currentDevice->_hsaAgent;
}
} // Namespace hip_impl.
namespace {
inline void track(const Agent_global& x, hsa_agent_t agent) {
tprintf(DB_MEM, " add variable '%s' with ptr=%p size=%u to tracker\n", x.name.c_str(),
x.address, x.byte_cnt);
@@ -299,6 +318,8 @@ inline void track(const Agent_global& x, hsa_agent_t agent) {
template <typename Container = vector<Agent_global>>
inline hsa_status_t copy_agent_global_variables(hsa_executable_t, hsa_agent_t agent,
hsa_executable_symbol_t x, void* out) {
using namespace hip_impl;
assert(out);
hsa_symbol_kind_t t = {};
@@ -313,90 +334,9 @@ inline hsa_status_t copy_agent_global_variables(hsa_executable_t, hsa_agent_t ag
return HSA_STATUS_SUCCESS;
}
inline hsa_agent_t this_agent() {
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) throw runtime_error{"No active HIP context."};
auto device = ctx->getDevice();
if (!device) throw runtime_error{"No device available for HIP."};
ihipDevice_t* currentDevice = ihipGetDevice(device->_deviceId);
if (!currentDevice) throw runtime_error{"No active device for HIP."};
return currentDevice->_hsaAgent;
}
inline vector<Agent_global> read_agent_globals(hsa_agent_t agent, hsa_executable_t executable) {
vector<Agent_global> r;
hsa_executable_iterate_agent_symbols(executable, agent, copy_agent_global_variables, &r);
return r;
}
template <typename ForwardIterator>
pair<hipDeviceptr_t, size_t> read_global_description(ForwardIterator f, ForwardIterator l,
const char* name) {
const auto it = std::find_if(f, l, [=](const Agent_global& x) { return x.name == name; });
return it == l ? make_pair(nullptr, 0u) : make_pair(it->address, it->byte_cnt);
}
hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
const char* name) {
// the key of the map would the hash of code object associated with the
// hipModule_t instance
static unordered_map<std::string, vector<Agent_global>> agent_globals;
auto key = hmod->hash;
if (agent_globals.count(key) == 0) {
static mutex mtx;
lock_guard<mutex> lck{mtx};
if (agent_globals.count(key) == 0) {
agent_globals.emplace(key, read_agent_globals(this_agent(), hmod->executable));
}
}
const auto it0 = agent_globals.find(key);
if (it0 == agent_globals.cend()) {
throw runtime_error{"agent_globals data structure corrupted."};
}
tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(), it0->second.cend(), name);
return *dptr ? hipSuccess : hipErrorNotFound;
}
hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes, const char* name) {
static unordered_map<hsa_agent_t, vector<Agent_global>> agent_globals;
static std::once_flag f;
call_once(f, []() {
for (auto&& agent_executables : hip_impl::executables()) {
vector<Agent_global> tmp0;
for (auto&& executable : agent_executables.second) {
auto tmp1 = read_agent_globals(agent_executables.first, executable);
tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()),
make_move_iterator(tmp1.end()));
}
agent_globals.emplace(agent_executables.first, move(tmp0));
}
});
const auto it = agent_globals.find(this_agent());
if (it == agent_globals.cend()) return hipErrorNotInitialized;
tie(*dptr, *bytes) = read_global_description(it->second.cbegin(), it->second.cend(), name);
return *dptr ? hipSuccess : hipErrorNotFound;
}
hsa_executable_symbol_t find_kernel_by_name(hsa_executable_t executable, const char* kname) {
using namespace hip_impl;
pair<const char*, hsa_executable_symbol_t> r{kname, {}};
hsa_executable_iterate_agent_symbols(
@@ -418,8 +358,8 @@ hsa_executable_symbol_t find_kernel_by_name(hsa_executable_t executable, const c
return r.second;
}
string read_elf_file_as_string(
const void* file) { // Precondition: file points to an ELF image that was BITWISE loaded
string read_elf_file_as_string(const void* file) {
// Precondition: file points to an ELF image that was BITWISE loaded
// into process accessible memory, and not one loaded by
// the loader. This is because in the latter case
// alignment may differ, which will break the size
@@ -428,15 +368,18 @@ string read_elf_file_as_string(
// Little Endian.
if (!file) return {};
auto h = static_cast<const Elf64_Ehdr*>(file);
auto h = static_cast<const ELFIO::Elf64_Ehdr*>(file);
auto s = static_cast<const char*>(file);
// This assumes the common case of SHT being the last part of the ELF.
auto sz = sizeof(Elf64_Ehdr) + h->e_shoff + h->e_shentsize * h->e_shnum;
auto sz =
sizeof(ELFIO::Elf64_Ehdr) + h->e_shoff + h->e_shentsize * h->e_shnum;
return string{s, s + sz};
}
string code_object_blob_for_agent(const void* maybe_bundled_code, hsa_agent_t agent) {
using namespace hip_impl;
if (!maybe_bundled_code) return {};
Bundled_code_header tmp{maybe_bundled_code};
@@ -454,9 +397,22 @@ string code_object_blob_for_agent(const void* maybe_bundled_code, hsa_agent_t ag
return string{it->blob.cbegin(), it->blob.cend()};
}
} // namespace
} // Unnamed namespace.
namespace hip_impl {
vector<Agent_global> read_agent_globals(hsa_agent_t agent,
hsa_executable_t executable) {
vector<Agent_global> r;
hsa_executable_iterate_agent_symbols(
executable, agent, copy_agent_global_variables, &r);
return r;
}
} // Namespace hip_impl.
hipError_t ihipModuleGetFunction(hipFunction_t* func, hipModule_t hmod, const char* name) {
using namespace hip_impl;
if (!func || !name) return hipErrorInvalidValue;
@@ -485,58 +441,36 @@ hipError_t hipModuleGetFunction(hipFunction_t* hfunc, hipModule_t hmod, const ch
return ihipLogStatus(ihipModuleGetFunction(hfunc, hmod, name));
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
const char* name) {
HIP_INIT_API(hipModuleGetGlobal, dptr, bytes, hmod, name);
namespace {
hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
hipFuncAttributes r{};
return ihipLogStatus(ihipModuleGetGlobal(dptr, bytes, hmod, name));
}
hipDeviceProp_t prop{};
hipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
// TODO: at the moment there is no way to query the count of registers
// available per CU, therefore we hardcode it to 64 KiRegisters.
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
hipError_t ihipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
const char* name) {
if (!dptr || !bytes) return hipErrorInvalidValue;
if (!name) return hipErrorNotInitialized;
const auto r = hmod ? read_agent_global_from_module(dptr, bytes, hmod, name)
: read_agent_global_from_process(dptr, bytes, name);
r.localSizeBytes = header.workitem_private_segment_byte_size;
r.sharedSizeBytes = header.workgroup_group_segment_byte_size;
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
r.numRegs = header.workitem_vgpr_count;
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
r.binaryVersion =
header.amd_machine_version_major * 10 +
header.amd_machine_version_minor;
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;
}
namespace
{
inline
hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header)
{
hipFuncAttributes r{};
hipDeviceProp_t prop{};
hipGetDeviceProperties(
&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
// TODO: at the moment there is no way to query the count of registers
// available per CU, therefore we hardcode it to 64 KiRegisters.
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
r.localSizeBytes = header.workitem_private_segment_byte_size;
r.sharedSizeBytes = header.workgroup_group_segment_byte_size;
r.maxDynamicSharedSizeBytes =
prop.sharedMemPerBlock - r.sharedSizeBytes;
r.numRegs = header.workitem_vgpr_count;
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
r.binaryVersion =
header.amd_machine_version_major * 10 +
header.amd_machine_version_minor;
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;
}
}
} // Unnamed namespace.
hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
{
using namespace hip_impl;
if (!attr) return hipErrorInvalidValue;
if (!func) return hipErrorInvalidDeviceFunction;
@@ -564,6 +498,7 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
}
hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
using namespace hip_impl;
if (!module) return hipErrorInvalidValue;
@@ -585,9 +520,8 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
auto content = tmp.empty() ? read_elf_file_as_string(image) : tmp;
(*module)->executable = hip_impl::load_executable(content,
(*module)->executable,
this_agent());
(*module)->executable = load_executable(content, (*module)->executable,
this_agent());
// compute the hash of the code object
(*module)->hash = checksum(content.length(), content.data());
@@ -621,6 +555,8 @@ hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned
}
hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name) {
using namespace hip_impl;
HIP_INIT_API(hipModuleGetTexRef, texRef, hmod, name);
hipError_t ret = hipErrorNotFound;
-659
Просмотреть файл
@@ -1,659 +0,0 @@
#include "../include/hip/hcc_detail/program_state.hpp"
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "hip_hcc_internal.h"
#include "hsa_helpers.hpp"
#include "trace_helper.h"
#include "elfio/elfio.hpp"
#include <link.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
using namespace ELFIO;
using namespace hip_impl;
using namespace std;
namespace {
struct Symbol {
string name;
ELFIO::Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
};
inline Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx) {
assert(idx < section.get_symbols_num());
Symbol r;
section.get_symbol(idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
return r;
}
template <typename P>
inline section* find_section_if(elfio& reader, P p) {
const auto it = find_if(reader.sections.begin(), reader.sections.end(), move(p));
return it != reader.sections.end() ? *it : nullptr;
}
vector<string> copy_names_of_undefined_symbols(const symbol_section_accessor& section) {
vector<string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(section, i);
if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) {
r.push_back(std::move(tmp.name));
}
}
return r;
}
const std::unordered_map<std::string, std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>&
symbol_addresses(bool rebuild = false) {
static unordered_map<string, pair<Elf64_Addr, Elf_Xword>> r;
static once_flag f;
auto cons = [rebuild]() {
if (rebuild) {
r.clear();
}
dl_iterate_phdr(
[](dl_phdr_info* info, size_t, void*) {
static constexpr const char self[] = "/proc/self/exe";
elfio reader;
static unsigned int iter = 0u;
if (reader.load(!iter ? self : info->dlpi_name)) {
auto it = find_section_if(
reader, [](const class section* x) { return x->get_type() == SHT_SYMTAB; });
if (it) {
const symbol_section_accessor symtab{reader, it};
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
auto tmp = read_symbol(symtab, i);
if (tmp.type == STT_OBJECT && tmp.sect_idx != SHN_UNDEF) {
const auto addr = tmp.value + (iter ? info->dlpi_addr : 0);
r.emplace(move(tmp.name), make_pair(addr, tmp.size));
}
}
}
++iter;
}
return 0;
},
nullptr);
};
call_once(f, cons);
if (rebuild) {
cons();
}
return r;
}
void associate_code_object_symbols_with_host_allocation(const elfio& reader,
section* code_object_dynsym,
hsa_agent_t agent,
hsa_executable_t executable) {
if (!code_object_dynsym) return;
const auto undefined_symbols =
copy_names_of_undefined_symbols(symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
if (globals().find(x) != globals().cend()) return;
const auto it1 = symbol_addresses().find(x);
if (it1 == symbol_addresses().cend()) {
throw runtime_error{"Global symbol: " + x + " is undefined."};
}
static mutex mtx;
lock_guard<mutex> lck{mtx};
if (globals().find(x) != globals().cend()) return;
globals().emplace(x, (void*)(it1->second.first));
void* p = nullptr;
hsa_amd_memory_lock(reinterpret_cast<void*>(it1->second.first), it1->second.second,
nullptr, // All agents.
0, &p);
hsa_executable_agent_global_variable_define(executable, agent, x.c_str(), p);
}
}
vector<char> code_object_blob_for_process() {
static constexpr const char self[] = "/proc/self/exe";
static constexpr const char kernel_section[] = ".kernel";
elfio reader;
if (!reader.load(self)) {
throw runtime_error{"Failed to load ELF file for current process."};
}
auto kernels =
find_section_if(reader, [](const section* x) { return x->get_name() == kernel_section; });
vector<char> r;
if (kernels) {
r.insert(r.end(), kernels->get_data(), kernels->get_data() + kernels->get_size());
}
return r;
}
const unordered_map<hsa_isa_t, vector<vector<char>>>& code_object_blobs(bool rebuild = false) {
static unordered_map<hsa_isa_t, vector<vector<char>>> r;
static once_flag f;
auto cons = [rebuild]() {
// names of shared libraries who .kernel sections already loaded
static unordered_set<string> lib_names;
static vector<vector<char>> blobs{code_object_blob_for_process()};
if (rebuild) {
r.clear();
blobs.clear();
}
dl_iterate_phdr(
[](dl_phdr_info* info, std::size_t, void*) {
elfio tmp;
if ((lib_names.find(info->dlpi_name) == lib_names.end()) &&
(tmp.load(info->dlpi_name))) {
const auto it = find_section_if(
tmp, [](const section* x) { return x->get_name() == ".kernel"; });
if (it) {
blobs.emplace_back(
it->get_data(), it->get_data() + it->get_size());
// register the shared library as already loaded
lib_names.emplace(info->dlpi_name);
}
}
return 0;
},
nullptr);
for (auto&& blob : blobs) {
for (auto sub_blob = blob.begin(); sub_blob != blob.end(); ) {
Bundled_code_header tmp(sub_blob, blob.end());
if (valid(tmp)) {
for (auto&& bundle : bundles(tmp)) {
r[triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
}
sub_blob+=tmp.bundled_code_size;
}
else {
break;
}
}
}
};
call_once(f, cons);
if (rebuild) {
cons();
}
return r;
}
vector<pair<uintptr_t, string>> function_names_for(const elfio& reader, section* symtab) {
vector<pair<uintptr_t, string>> r;
symbol_section_accessor symbols{reader, symtab};
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(symbols, i);
if (tmp.type == STT_FUNC && tmp.sect_idx != SHN_UNDEF && !tmp.name.empty()) {
r.emplace_back(tmp.value, tmp.name);
}
}
return r;
}
const vector<pair<uintptr_t, string>>& function_names_for_process(bool rebuild = false) {
static constexpr const char self[] = "/proc/self/exe";
static vector<pair<uintptr_t, string>> r;
static once_flag f;
auto cons = [rebuild]() {
elfio reader;
if (!reader.load(self)) {
throw runtime_error{"Failed to load the ELF file for the current process."};
}
auto symtab =
find_section_if(reader, [](const section* x) { return x->get_type() == SHT_SYMTAB; });
if (symtab) r = function_names_for(reader, symtab);
};
call_once(f, cons);
if (rebuild) {
cons();
}
return r;
}
const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels(bool rebuild = false) {
static unordered_map<string, vector<hsa_executable_symbol_t>> r;
static once_flag f;
auto cons = [rebuild]() {
if (rebuild) {
r.clear();
executables(rebuild);
}
static const auto copy_kernels = [](hsa_executable_t, hsa_agent_t,
hsa_executable_symbol_t s, void*) {
if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
return HSA_STATUS_SUCCESS;
};
for (auto&& agent_executables : executables()) {
for (auto&& executable : agent_executables.second) {
hsa_executable_iterate_agent_symbols(executable, agent_executables.first,
copy_kernels, nullptr);
}
}
};
call_once(f, cons);
if (rebuild) {
cons();
}
return r;
}
void load_code_object_and_freeze_executable(
const string& file, hsa_agent_t agent,
hsa_executable_t
executable) { // TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
if (p) {
hsa_code_object_reader_destroy(*p);
delete p;
}
};
using RAII_code_reader = unique_ptr<hsa_code_object_reader_t, decltype(cor_deleter)>;
if (!file.empty()) {
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_memory(file.data(), file.size(), tmp.get());
hsa_executable_load_agent_code_object(executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static vector<RAII_code_reader> code_readers;
static mutex mtx;
lock_guard<mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
}
size_t parse_args(
const string& metadata,
size_t f,
size_t l,
vector<pair<size_t, size_t>>& size_align) {
if (f == l) return f;
if (!size_align.empty()) return l;
do {
static constexpr size_t size_sz{5};
f = metadata.find("Size:", f) + size_sz;
if (l <= f) return f;
auto size = strtoul(&metadata[f], nullptr, 10);
static constexpr size_t align_sz{6};
f = metadata.find("Align:", f) + align_sz;
char* l{};
auto align = strtoul(&metadata[f], &l, 10);
f += (l - &metadata[f]) + 1;
size_align.emplace_back(size, align);
} while (true);
}
void read_kernarg_metadata(
elfio& reader,
unordered_map<string, vector<pair<size_t, size_t>>>& kernargs)
{ // TODO: this is inefficient.
auto it = find_section_if(
reader, [](const section* x) { return x->get_type() == SHT_NOTE; });
if (!it) return;
const note_section_accessor acc{reader, it};
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
ELFIO::Elf_Word type{};
string name{};
void* desc{};
Elf_Word desc_size{};
acc.get_note(i, type, name, desc, desc_size);
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
string tmp{
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
auto dx = tmp.find("Kernels:");
if (dx == string::npos) continue;
static constexpr decltype(tmp.size()) kernels_sz{8};
dx += kernels_sz;
do {
dx = tmp.find("Name:", dx);
if (dx == string::npos) break;
static constexpr decltype(tmp.size()) name_sz{5};
dx = tmp.find_first_not_of(" '", dx + name_sz);
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
dx += fn.size();
auto dx1 = tmp.find("CodeProps", dx);
dx = tmp.find("Args:", dx);
if (dx1 < dx) {
dx = dx1;
continue;
}
if (dx == string::npos) break;
static constexpr decltype(tmp.size()) args_sz{5};
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
} while (true);
}
}
} // namespace
namespace hip_impl {
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>&
executables(bool rebuild) { // TODO: This leaks the hsa_executable_ts, it should use RAII.
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
static once_flag f;
auto cons = [rebuild]() {
static const auto accelerators = hc::accelerator::get_all();
if (rebuild) {
// do NOT clear r so we reuse instances of hsa_executable_t
// created previously
code_object_blobs(rebuild);
}
for (auto&& acc : accelerators) {
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
if (!agent || !acc.is_hsa_accelerator()) continue;
hsa_agent_iterate_isas(*agent,
[](hsa_isa_t x, void* pa) {
const auto it = code_object_blobs().find(x);
if (it != code_object_blobs().cend()) {
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
for (auto&& blob : it->second) {
hsa_executable_t tmp = {};
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr,
&tmp);
// TODO: this is massively inefficient and only
// meant for illustration.
string blob_to_str{blob.cbegin(), blob.cend()};
tmp = load_executable(blob_to_str, tmp, a);
if (tmp.handle) r[a].push_back(tmp);
}
}
return HSA_STATUS_SUCCESS;
},
agent);
}
};
call_once(f, cons);
if (rebuild) {
cons();
}
return r;
}
const unordered_map<uintptr_t, string>& function_names(bool rebuild) {
static unordered_map<uintptr_t, string> r{function_names_for_process().cbegin(),
function_names_for_process().cend()};
static once_flag f;
auto cons = [rebuild]() {
if (rebuild) {
r.clear();
function_names_for_process(rebuild);
r.insert(function_names_for_process().cbegin(),
function_names_for_process().cend());
}
dl_iterate_phdr(
[](dl_phdr_info* info, size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(
tmp, [](const section* x) { return x->get_type() == SHT_SYMTAB; });
if (it) {
auto n = function_names_for(tmp, it);
for (auto&& f : n) f.first += info->dlpi_addr;
r.insert(make_move_iterator(n.begin()), make_move_iterator(n.end()));
}
}
return 0;
},
nullptr);
};
call_once(f, cons);
if (rebuild) {
static mutex mtx;
lock_guard<mutex> lck{mtx};
cons();
}
return r;
}
const unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions(bool rebuild) {
static unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
static once_flag f;
auto cons = [rebuild]() {
if (rebuild) {
// do NOT clear r so we reuse instances of pair<hsa_agent_t, Kernel_descriptor>
// created previously
function_names(rebuild);
kernargs(rebuild);
kernels(rebuild);
globals(rebuild);
}
for (auto&& function : function_names()) {
const auto it = kernels().find(function.second);
if (it != kernels().cend()) {
for (auto&& kernel_symbol : it->second) {
r[function.first].emplace_back(
agent(kernel_symbol),
Kernel_descriptor{kernel_object(kernel_symbol), it->first});
}
}
}
};
call_once(f, cons);
if (rebuild) {
static mutex mtx;
lock_guard<mutex> lck{mtx};
cons();
}
return r;
}
unordered_map<string, void*>& globals(bool rebuild) {
static unordered_map<string, void*> r;
static once_flag f;
auto cons =[rebuild]() {
if (rebuild) {
r.clear();
symbol_addresses(rebuild);
}
r.reserve(symbol_addresses().size());
};
call_once(f, cons);
if (rebuild) {
cons();
}
return r;
}
const unordered_map<string, vector<pair<size_t, size_t>>>& kernargs(
bool rebuild) {
static unordered_map<string, vector<pair<size_t, size_t>>> r;
static once_flag f;
static const auto build_map = [](decltype(r)& x) {
for (auto&& isa_blobs : code_object_blobs()) {
for (auto&& blob : isa_blobs.second) {
stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
elfio reader;
if (!reader.load(tmp)) continue;
read_kernarg_metadata(reader, x);
}
}
};
call_once(f, []() { r.reserve(function_names().size()); build_map(r); });
if (rebuild) {
static mutex mtx;
thread_local static decltype(r) tmp;
{
lock_guard<mutex> lck{mtx};
tmp.insert(r.cbegin(), r.cend()); // Should use merge in C++17.
}
build_map(tmp);
lock_guard<mutex> lck{mtx};
r.insert(tmp.cbegin(), tmp.cend());
}
return r;
}
hsa_executable_t load_executable(const string& file, hsa_executable_t executable,
hsa_agent_t agent) {
elfio reader;
stringstream tmp{file};
if (!reader.load(tmp)) return hsa_executable_t{};
const auto code_object_dynsym = find_section_if(
reader, [](const ELFIO::section* x) { return x->get_type() == SHT_DYNSYM; });
associate_code_object_symbols_with_host_allocation(reader, code_object_dynsym, agent,
executable);
load_code_object_and_freeze_executable(file, agent, executable);
return executable;
}
// HIP startup kernel loader logic
// When enabled HIP_STARTUP_LOADER, HIP will load the kernels and setup
// the function symbol map on program startup
extern "C" void __attribute__((constructor)) __startup_kernel_loader_init() {
int hip_startup_loader=0;
if (std::getenv("HIP_STARTUP_LOADER"))
hip_startup_loader = atoi(std::getenv("HIP_STARTUP_LOADER"));
if (hip_startup_loader) functions(true);
}
extern "C" void __attribute__((destructor)) __startup_kernel_loader_fini() {
}
} // Namespace hip_impl.