dlopen() fixes (#929)
* Initial attempt to switch over to internally linked state. * Add missing CMake update. * hipLaunchKernelGGLImpl must be inline as well. Ensure internal linkage. * Ensure global retrieval uses internally linked state. * Hide HC in the implementation. Minimise ADL woes. * Strange software exists, and must be catered to. * Use a less spammy mechanism for ensuring internal linkage / non-export. * Remove leftover internal detail.
Этот коммит содержится в:
коммит произвёл
Maneesh Gupta
родитель
2d67bc5e9c
Коммит
ea0fcf3e61
@@ -240,7 +240,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
src/hip_surface.cpp
|
||||
src/hip_intercept.cpp
|
||||
src/env.cpp
|
||||
src/program_state.cpp
|
||||
src/h2f.cpp)
|
||||
|
||||
execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS)
|
||||
|
||||
@@ -31,6 +31,7 @@ THE SOFTWARE.
|
||||
#include "hip/hip_hcc.h"
|
||||
#include "hip_runtime.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
@@ -105,21 +106,13 @@ inline std::vector<std::uint8_t> make_kernarg(
|
||||
|
||||
auto it = function_names().find(reinterpret_cast<std::uintptr_t>(kernel));
|
||||
if (it == function_names().cend()) {
|
||||
it =
|
||||
function_names(true).find(reinterpret_cast<std::uintptr_t>(kernel));
|
||||
if (it == function_names().cend()) {
|
||||
throw std::runtime_error{"Undefined __global__ function."};
|
||||
}
|
||||
hip_throw(std::runtime_error{"Undefined __global__ function."});
|
||||
}
|
||||
|
||||
auto it1 = kernargs().find(it->second);
|
||||
if (it1 == kernargs().end()) {
|
||||
it1 = kernargs(true).find(it->second);
|
||||
|
||||
if (it1 == kernargs().end()) {
|
||||
throw std::runtime_error{
|
||||
"Missing metadata for __global__ function: " + it->second};
|
||||
}
|
||||
hip_throw(std::runtime_error{
|
||||
"Missing metadata for __global__ function: " + it->second});
|
||||
}
|
||||
|
||||
std::tuple<Formals...> to_formals{std::move(actuals)};
|
||||
@@ -129,23 +122,87 @@ inline std::vector<std::uint8_t> make_kernarg(
|
||||
return make_kernarg<0>(to_formals, it1->second, std::move(kernarg));
|
||||
}
|
||||
|
||||
void hipLaunchKernelGGLImpl(std::uintptr_t function_address, const dim3& numBlocks,
|
||||
const dim3& dimBlocks, std::uint32_t sharedMemBytes, hipStream_t stream,
|
||||
void** kernarg);
|
||||
} // Namespace hip_impl.
|
||||
inline
|
||||
std::string name(std::uintptr_t function_address)
|
||||
{
|
||||
const auto it = function_names().find(function_address);
|
||||
|
||||
if (it == function_names().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Invalid function passed to hipLaunchKernelGGL."});
|
||||
}
|
||||
|
||||
return it->second;
|
||||
}
|
||||
|
||||
inline
|
||||
std::string name(hsa_agent_t agent)
|
||||
{
|
||||
char n[64]{};
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
|
||||
|
||||
return std::string{n};
|
||||
}
|
||||
|
||||
hsa_agent_t target_agent(hipStream_t stream);
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
void hipLaunchKernelGGLImpl(
|
||||
std::uintptr_t function_address,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void** kernarg) {
|
||||
auto it0 = functions().find(function_address);
|
||||
|
||||
if (it0 == functions().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"No device code available for function: " +
|
||||
name(function_address)});
|
||||
}
|
||||
|
||||
auto agent = target_agent(stream);
|
||||
|
||||
const auto it1 = std::find_if(
|
||||
it0->second.cbegin(),
|
||||
it0->second.cend(),
|
||||
[=](const std::pair<hsa_agent_t, Kernel_descriptor>& x) {
|
||||
return x.first == agent;
|
||||
});
|
||||
|
||||
if (it1 == it0->second.cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"No code available for function: " + name(function_address) +
|
||||
", for agent: " + name(agent)});
|
||||
}
|
||||
|
||||
hipModuleLaunchKernel(it1->second, numBlocks.x, numBlocks.y, numBlocks.z,
|
||||
dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
|
||||
stream, nullptr, kernarg);
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
template <typename... Args, typename F = void (*)(Args...)>
|
||||
inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
|
||||
inline
|
||||
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes, hipStream_t stream,
|
||||
Args... args) {
|
||||
auto kernarg = hip_impl::make_kernarg(
|
||||
kernel, std::tuple<Args...>{std::move(args)...});
|
||||
std::size_t kernarg_size = kernarg.size();
|
||||
|
||||
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE,
|
||||
&kernarg_size, HIP_LAUNCH_PARAM_END};
|
||||
void* config[]{
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER,
|
||||
kernarg.data(),
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE,
|
||||
&kernarg_size,
|
||||
HIP_LAUNCH_PARAM_END};
|
||||
|
||||
hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel), numBlocks, dimBlocks,
|
||||
sharedMemBytes, stream, &config[0]);
|
||||
hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
numBlocks, dimBlocks, sharedMemBytes,
|
||||
stream, &config[0]);
|
||||
}
|
||||
|
||||
template <typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
|
||||
|
||||
@@ -35,11 +35,14 @@ THE SOFTWARE.
|
||||
#define GENERIC_GRID_LAUNCH 1
|
||||
#endif
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <hip/hcc_detail/host_defines.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hcc_detail/driver_types.h>
|
||||
#include <hip/hcc_detail/hip_texture_types.h>
|
||||
#include <hip/hcc_detail/hip_surface_types.h>
|
||||
#include <hip/hcc_detail/program_state.hpp>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define DEPRECATED(msg) __declspec(deprecated(msg))
|
||||
@@ -58,6 +61,11 @@ THE SOFTWARE.
|
||||
#define HIP_LAUNCH_PARAM_END ((void*)0x03)
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#define __dparm(x) \
|
||||
= x
|
||||
#else
|
||||
@@ -1363,6 +1371,61 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h
|
||||
hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes,
|
||||
hipStream_t stream);
|
||||
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipModuleGetGlobal(void**, size_t*, hipModule_t, const char*);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Copies the memory address of symbol @p symbolName to @p devPtr
|
||||
*
|
||||
* @param[in] symbolName - Symbol on device
|
||||
* @param[out] devPtr - Pointer to a pointer to the memory referred to by the symbol
|
||||
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
|
||||
*
|
||||
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
|
||||
* hipMemcpyFromSymbolAsync
|
||||
*/
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
|
||||
//HIP_INIT_API(hipGetSymbolAddress, devPtr, symbolName);
|
||||
|
||||
size_t size = 0;
|
||||
return hipModuleGetGlobal(devPtr, &size, 0, (const char*)symbolName);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Copies the size of symbol @p symbolName to @p size
|
||||
*
|
||||
* @param[in] symbolName - Symbol on device
|
||||
* @param[out] size - Pointer to the size of the symbol
|
||||
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
|
||||
*
|
||||
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
|
||||
* hipMemcpyFromSymbolAsync
|
||||
*/
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {
|
||||
// HIP_INIT_API(hipGetSymbolSize, size, symbolName);
|
||||
|
||||
void* devPtr = nullptr;
|
||||
return hipModuleGetGlobal(&devPtr, size, 0, (const char*)symbolName);
|
||||
}
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
namespace hip_impl {
|
||||
hipError_t hipMemcpyToSymbol(void*, const void*, size_t, size_t, hipMemcpyKind,
|
||||
const char*);
|
||||
} // Namespace hip_impl.
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area
|
||||
@@ -1387,35 +1450,36 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t siz
|
||||
* hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
|
||||
* hipMemcpyFromSymbolAsync
|
||||
*/
|
||||
hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src, size_t sizeBytes,
|
||||
size_t offset __dparm(0), hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src,
|
||||
size_t sizeBytes, size_t offset __dparm(0),
|
||||
hipMemcpyKind kind __dparm(hipMemcpyHostToDevice)) {
|
||||
if (!symbolName) return hipErrorInvalidSymbol;
|
||||
|
||||
hipDeviceptr_t dst = NULL;
|
||||
hipGetSymbolAddress(&dst, (const char*)symbolName);
|
||||
|
||||
/**
|
||||
* @brief Copies the memory address of symbol @p symbolName to @p devPtr
|
||||
*
|
||||
* @param[in] symbolName - Symbol on device
|
||||
* @param[out] devPtr - Pointer to a pointer to the memory referred to by the symbol
|
||||
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
|
||||
*
|
||||
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
|
||||
* hipMemcpyFromSymbolAsync
|
||||
*/
|
||||
hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName);
|
||||
return hip_impl::hipMemcpyToSymbol(dst, src, sizeBytes, offset, kind,
|
||||
(const char*)symbolName);
|
||||
}
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Copies the size of symbol @p symbolName to @p size
|
||||
*
|
||||
* @param[in] symbolName - Symbol on device
|
||||
* @param[out] size - Pointer to the size of the symbol
|
||||
* @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
|
||||
*
|
||||
* @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
|
||||
* hipMemcpyFromSymbolAsync
|
||||
*/
|
||||
hipError_t hipGetSymbolSize(size_t* size, const void* symbolName);
|
||||
namespace hip_impl {
|
||||
hipError_t hipMemcpyToSymbolAsync(void*, const void*, size_t, size_t,
|
||||
hipMemcpyKind, hipStream_t, const char*);
|
||||
hipError_t hipMemcpyFromSymbol(void*, const void*, size_t, size_t,
|
||||
hipMemcpyKind, const char*);
|
||||
hipError_t hipMemcpyFromSymbolAsync(void*, const void*, size_t, size_t,
|
||||
hipMemcpyKind, hipStream_t, const char*);
|
||||
} // Namespace hip_impl.
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area
|
||||
@@ -1442,14 +1506,50 @@ hipError_t hipGetSymbolSize(size_t* size, const void* symbolName);
|
||||
* hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
|
||||
* hipMemcpyFromSymbolAsync
|
||||
*/
|
||||
hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src, size_t sizeBytes,
|
||||
size_t offset, hipMemcpyKind kind, hipStream_t stream __dparm(0));
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src,
|
||||
size_t sizeBytes, size_t offset,
|
||||
hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
|
||||
if (!symbolName) return hipErrorInvalidSymbol;
|
||||
|
||||
hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes,
|
||||
size_t offset __dparm(0), hipMemcpyKind kind __dparm( hipMemcpyDeviceToHost ));
|
||||
hipDeviceptr_t dst = NULL;
|
||||
hipGetSymbolAddress(&dst, symbolName);
|
||||
|
||||
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t sizeBytes,
|
||||
size_t offset, hipMemcpyKind kind, hipStream_t stream __dparm(0));
|
||||
return hip_impl::hipMemcpyToSymbolAsync(dst, src, sizeBytes, offset, kind,
|
||||
stream,
|
||||
(const char*)symbolName);
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName,
|
||||
size_t sizeBytes, size_t offset __dparm(0),
|
||||
hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
|
||||
if (!symbolName) return hipErrorInvalidSymbol;
|
||||
|
||||
hipDeviceptr_t src = NULL;
|
||||
hipGetSymbolAddress(&src, symbolName);
|
||||
|
||||
return hip_impl::hipMemcpyFromSymbol(dst, src, sizeBytes, offset, kind,
|
||||
(const char*)symbolName);
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
|
||||
size_t sizeBytes, size_t offset,
|
||||
hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(0)) {
|
||||
if (!symbolName) return hipErrorInvalidSymbol;
|
||||
|
||||
hipDeviceptr_t src = NULL;
|
||||
hipGetSymbolAddress(&src, symbolName);
|
||||
|
||||
return hip_impl::hipMemcpyFromSymbolAsync(dst, src, sizeBytes, offset, kind,
|
||||
stream,
|
||||
(const char*)symbolName);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Copy data from src to dst asynchronously.
|
||||
@@ -2397,6 +2497,103 @@ hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, con
|
||||
|
||||
hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func);
|
||||
|
||||
struct Agent_global {
|
||||
std::string name;
|
||||
hipDeviceptr_t address;
|
||||
uint32_t byte_cnt;
|
||||
};
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
namespace hip_impl {
|
||||
hsa_executable_t executable_for(hipModule_t);
|
||||
const std::string& hash_for(hipModule_t);
|
||||
|
||||
template<typename ForwardIterator>
|
||||
std::pair<hipDeviceptr_t, std::size_t> read_global_description(
|
||||
ForwardIterator f, ForwardIterator l, const char* name) {
|
||||
const auto it = std::find_if(f, l, [=](const Agent_global& x) {
|
||||
return x.name == name;
|
||||
});
|
||||
|
||||
return it == l ?
|
||||
std::make_pair(nullptr, 0u) : std::make_pair(it->address, it->byte_cnt);
|
||||
}
|
||||
|
||||
std::vector<Agent_global> read_agent_globals(hsa_agent_t agent,
|
||||
hsa_executable_t executable);
|
||||
hsa_agent_t this_agent();
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
hipModule_t hmod, const char* name) {
|
||||
// the key of the map would the hash of code object associated with the
|
||||
// hipModule_t instance
|
||||
static std::unordered_map<
|
||||
std::string, std::vector<Agent_global>> agent_globals;
|
||||
auto key = hash_for(hmod);
|
||||
|
||||
if (agent_globals.count(key) == 0) {
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
|
||||
if (agent_globals.count(key) == 0) {
|
||||
agent_globals.emplace(
|
||||
key, read_agent_globals(this_agent(), executable_for(hmod)));
|
||||
}
|
||||
}
|
||||
|
||||
const auto it0 = agent_globals.find(key);
|
||||
if (it0 == agent_globals.cend()) {
|
||||
hip_throw(
|
||||
std::runtime_error{"agent_globals data structure corrupted."});
|
||||
}
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(),
|
||||
it0->second.cend(), name);
|
||||
|
||||
return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
const char* name) {
|
||||
static std::unordered_map<
|
||||
hsa_agent_t, std::vector<Agent_global>> agent_globals;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
for (auto&& agent_executables : executables()) {
|
||||
std::vector<Agent_global> tmp0;
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
auto tmp1 = read_agent_globals(agent_executables.first,
|
||||
executable);
|
||||
|
||||
tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()),
|
||||
make_move_iterator(tmp1.end()));
|
||||
}
|
||||
agent_globals.emplace(agent_executables.first, move(tmp0));
|
||||
}
|
||||
});
|
||||
|
||||
const auto it = agent_globals.find(this_agent());
|
||||
|
||||
if (it == agent_globals.cend()) return hipErrorNotInitialized;
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(it->second.cbegin(),
|
||||
it->second.cend(), name);
|
||||
|
||||
return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief returns device memory pointer and size of the kernel present in the module with symbol @p
|
||||
* name
|
||||
@@ -2408,11 +2605,20 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func);
|
||||
*
|
||||
* @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
|
||||
const char* name);
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
|
||||
hipModule_t hmod, const char* name) {
|
||||
if (!dptr || !bytes) return hipErrorInvalidValue;
|
||||
|
||||
hipError_t ihipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
|
||||
const char* name);
|
||||
if (!name) return hipErrorNotInitialized;
|
||||
|
||||
const auto r = hmod ?
|
||||
hip_impl::read_agent_global_from_module(dptr, bytes, hmod, name) :
|
||||
hip_impl::read_agent_global_from_process(dptr, bytes, name);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name);
|
||||
/**
|
||||
|
||||
@@ -27,15 +27,6 @@ THE SOFTWARE.
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) { return x.handle == y.handle; }
|
||||
|
||||
namespace std {
|
||||
template <>
|
||||
struct hash<hsa_isa_t> {
|
||||
size_t operator()(hsa_isa_t x) const { return hash<decltype(x.handle)>{}(x.handle); }
|
||||
};
|
||||
} // namespace std
|
||||
|
||||
namespace hip_impl {
|
||||
inline void* address(hsa_executable_symbol_t x) {
|
||||
void* r = nullptr;
|
||||
@@ -85,7 +85,7 @@ requires(Domain<K> ==
|
||||
hc::parallel_for_each(acc_v, d, k);
|
||||
} catch (std::exception& ex) {
|
||||
std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
|
||||
throw;
|
||||
hip_throw(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,7 +113,7 @@ requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch
|
||||
group_mem_bytes, acc_v, std::move(k));
|
||||
} catch (std::exception& ex) {
|
||||
std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
|
||||
throw;
|
||||
hip_throw(ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,14 +22,35 @@ THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "code_object_bundle.hpp"
|
||||
#include "hsa_helpers.hpp"
|
||||
|
||||
#if !defined(__cpp_exceptions)
|
||||
#define try if (true)
|
||||
#define catch(...) if (false)
|
||||
#endif
|
||||
#include "elfio/elfio.hpp"
|
||||
#if !defined(__cpp_exceptions)
|
||||
#undef try
|
||||
#undef catch
|
||||
#endif
|
||||
|
||||
#include <hsa/amd_hsa_kernel_code.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
|
||||
#include <link.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <istream>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
@@ -39,13 +60,27 @@ struct ihipModuleSymbol_t;
|
||||
using hipFunction_t = ihipModuleSymbol_t*;
|
||||
|
||||
namespace std {
|
||||
template <>
|
||||
template<>
|
||||
struct hash<hsa_agent_t> {
|
||||
size_t operator()(hsa_agent_t x) const { return hash<decltype(x.handle)>{}(x.handle); }
|
||||
size_t operator()(hsa_agent_t x) const {
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<hsa_isa_t> {
|
||||
size_t operator()(hsa_isa_t x) const {
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
} // namespace std
|
||||
|
||||
inline constexpr bool operator==(hsa_agent_t x, hsa_agent_t y) { return x.handle == y.handle; }
|
||||
inline constexpr bool operator==(hsa_agent_t x, hsa_agent_t y) {
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) {
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
|
||||
namespace hip_impl {
|
||||
class Kernel_descriptor {
|
||||
@@ -93,16 +128,517 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
const std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>>& executables(
|
||||
bool rebuild = false);
|
||||
const std::unordered_map<std::uintptr_t, std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>&
|
||||
functions(bool rebuild = false);
|
||||
const std::unordered_map<std::uintptr_t, std::string>& function_names(bool rebuild = false);
|
||||
std::unordered_map<std::string, void*>& globals(bool rebuild = false);
|
||||
const std::unordered_map<
|
||||
std::string, std::vector<std::pair<std::size_t, std::size_t>>>&
|
||||
kernargs(bool rebuild = false);
|
||||
template<typename P>
|
||||
inline
|
||||
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) {
|
||||
const auto it = std::find_if(
|
||||
reader.sections.begin(), reader.sections.end(), std::move(p));
|
||||
|
||||
hsa_executable_t load_executable(const std::string& file, hsa_executable_t executable,
|
||||
hsa_agent_t agent);
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
hsa_isa_t, std::vector<std::vector<char>>>& code_object_blobs() {
|
||||
static std::unordered_map<hsa_isa_t, std::vector<std::vector<char>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
static std::vector<std::vector<char>> blobs{};
|
||||
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
ELFIO::elfio tmp;
|
||||
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_name() == ".kernel";
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
blobs.emplace_back(it->get_data(), it->get_data() + it->get_size());
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
|
||||
for (auto&& multi_arch_blob : blobs) {
|
||||
auto it = multi_arch_blob.begin();
|
||||
while (it != multi_arch_blob.end()) {
|
||||
Bundled_code_header tmp{it, multi_arch_blob.end()};
|
||||
|
||||
if (!valid(tmp)) break;
|
||||
|
||||
for (auto&& bundle : bundles(tmp)) {
|
||||
r[triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
|
||||
}
|
||||
|
||||
it += tmp.bundled_code_size;
|
||||
};
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
struct Symbol {
|
||||
std::string name;
|
||||
ELFIO::Elf64_Addr value = 0;
|
||||
ELFIO::Elf_Xword size = 0;
|
||||
ELFIO::Elf_Half sect_idx = 0;
|
||||
std::uint8_t bind = 0;
|
||||
std::uint8_t type = 0;
|
||||
std::uint8_t other = 0;
|
||||
};
|
||||
|
||||
inline
|
||||
Symbol read_symbol(const ELFIO::symbol_section_accessor& section,
|
||||
unsigned int idx) {
|
||||
assert(idx < section.get_symbols_num());
|
||||
|
||||
Symbol r;
|
||||
section.get_symbol(
|
||||
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::string,
|
||||
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& symbol_addresses() {
|
||||
static std::unordered_map<
|
||||
std::string, std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
ELFIO::elfio tmp;
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
const ELFIO::symbol_section_accessor symtab{tmp, it};
|
||||
|
||||
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
|
||||
auto s = read_symbol(symtab, i);
|
||||
|
||||
if (s.type != STT_OBJECT || s.sect_idx == SHN_UNDEF) continue;
|
||||
|
||||
const auto addr = s.value + info->dlpi_addr;
|
||||
r.emplace(std::move(s.name), std::make_pair(addr, s.size));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
std::unordered_map<std::string, void*>& globals() {
|
||||
static std::unordered_map<std::string, void*> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() { r.reserve(symbol_addresses().size()); });
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::string> copy_names_of_undefined_symbols(
|
||||
const ELFIO::symbol_section_accessor& section) {
|
||||
std::vector<std::string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(section, i);
|
||||
if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue;
|
||||
|
||||
r.push_back(std::move(tmp.name));
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
void hip_throw(const std::exception&);
|
||||
|
||||
inline
|
||||
void associate_code_object_symbols_with_host_allocation(
|
||||
const ELFIO::elfio& reader,
|
||||
ELFIO::section* code_object_dynsym,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable) {
|
||||
if (!code_object_dynsym) return;
|
||||
|
||||
const auto undefined_symbols = copy_names_of_undefined_symbols(
|
||||
ELFIO::symbol_section_accessor{reader, code_object_dynsym});
|
||||
|
||||
for (auto&& x : undefined_symbols) {
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
|
||||
const auto it1 = symbol_addresses().find(x);
|
||||
|
||||
if (it1 == symbol_addresses().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Global symbol: " + x + " is undefined."});
|
||||
}
|
||||
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
|
||||
globals().emplace(x, (void*)(it1->second.first));
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(it1->second.first),
|
||||
it1->second.second,
|
||||
nullptr, // All agents.
|
||||
0,
|
||||
&p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void load_code_object_and_freeze_executable(
|
||||
const std::string& file, hsa_agent_t agent, hsa_executable_t executable) {
|
||||
// TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
|
||||
if (!p) return;
|
||||
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
delete p;
|
||||
};
|
||||
|
||||
using RAII_code_reader =
|
||||
std::unique_ptr<hsa_code_object_reader_t, decltype(cor_deleter)>;
|
||||
|
||||
if (file.empty()) return;
|
||||
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_memory(
|
||||
file.data(), file.size(), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(
|
||||
executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
static std::vector<RAII_code_reader> code_readers;
|
||||
static std::mutex mtx;
|
||||
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
code_readers.push_back(move(tmp));
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_executable_t load_executable(const std::string& file,
|
||||
hsa_executable_t executable,
|
||||
hsa_agent_t agent) {
|
||||
ELFIO::elfio reader;
|
||||
std::stringstream tmp{file};
|
||||
|
||||
if (!reader.load(tmp)) return hsa_executable_t{};
|
||||
|
||||
const auto code_object_dynsym = find_section_if(
|
||||
reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_DYNSYM;
|
||||
});
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(reader,
|
||||
code_object_dynsym,
|
||||
agent, executable);
|
||||
|
||||
load_code_object_and_freeze_executable(file, agent, executable);
|
||||
|
||||
return executable;
|
||||
}
|
||||
|
||||
std::vector<hsa_agent_t> all_hsa_agents();
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
hsa_agent_t, std::vector<hsa_executable_t>>& executables() {
|
||||
static std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
for (auto&& agent : hip_impl::all_hsa_agents()) {
|
||||
hsa_agent_iterate_isas(agent, [](hsa_isa_t x, void* pa) {
|
||||
const auto it = code_object_blobs().find(x);
|
||||
|
||||
if (it == code_object_blobs().cend()) return HSA_STATUS_SUCCESS;
|
||||
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
|
||||
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only meant for
|
||||
// illustration.
|
||||
std::string blob_to_str{blob.cbegin(), blob.cend()};
|
||||
tmp = load_executable(blob_to_str, tmp, a);
|
||||
|
||||
if (tmp.handle) r[a].push_back(tmp);
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}, &agent);
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::pair<std::uintptr_t, std::string>> function_names_for(
|
||||
const ELFIO::elfio& reader, ELFIO::section* symtab) {
|
||||
std::vector<std::pair<std::uintptr_t, std::string>> r;
|
||||
ELFIO::symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(symbols, i);
|
||||
|
||||
if (tmp.type != STT_FUNC) continue;
|
||||
if (tmp.type == SHN_UNDEF) continue;
|
||||
if (tmp.name.empty()) continue;
|
||||
|
||||
r.emplace_back(tmp.value, tmp.name);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<std::uintptr_t, std::string>& function_names() {
|
||||
static std::unordered_map<std::uintptr_t, std::string> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
ELFIO::elfio tmp;
|
||||
const auto elf =
|
||||
info->dlpi_addr ? info->dlpi_name : "/proc/self/exe";
|
||||
|
||||
if (!tmp.load(elf)) return 0;
|
||||
|
||||
const auto it = find_section_if(tmp, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (!it) return 0;
|
||||
|
||||
auto names = function_names_for(tmp, it);
|
||||
for (auto&& x : names) x.first += info->dlpi_addr;
|
||||
|
||||
r.insert(
|
||||
std::make_move_iterator(names.begin()),
|
||||
std::make_move_iterator(names.end()));
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::string, std::vector<hsa_executable_symbol_t>>& kernels() {
|
||||
static std::unordered_map<
|
||||
std::string, std::vector<hsa_executable_symbol_t>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
static const auto copy_kernels = [](
|
||||
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void*) {
|
||||
if (type(x) == HSA_SYMBOL_KIND_KERNEL) r[name(x)].push_back(x);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
for (auto&& agent_executables : executables()) {
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
executable, agent_executables.first, copy_kernels, nullptr);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::uintptr_t,
|
||||
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions() {
|
||||
static std::unordered_map<
|
||||
std::uintptr_t,
|
||||
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
for (auto&& function : function_names()) {
|
||||
const auto it = kernels().find(function.second);
|
||||
|
||||
if (it == kernels().cend()) continue;
|
||||
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
r[function.first].emplace_back(
|
||||
agent(kernel_symbol),
|
||||
Kernel_descriptor{kernel_object(kernel_symbol), it->first});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
std::size_t parse_args(
|
||||
const std::string& metadata,
|
||||
std::size_t f,
|
||||
std::size_t l,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
|
||||
if (f == l) return f;
|
||||
if (!size_align.empty()) return l;
|
||||
|
||||
do {
|
||||
static constexpr size_t size_sz{5};
|
||||
f = metadata.find("Size:", f) + size_sz;
|
||||
|
||||
if (l <= f) return f;
|
||||
|
||||
auto size = std::strtoul(&metadata[f], nullptr, 10);
|
||||
|
||||
static constexpr size_t align_sz{6};
|
||||
f = metadata.find("Align:", f) + align_sz;
|
||||
|
||||
char* l{};
|
||||
auto align = std::strtoul(&metadata[f], &l, 10);
|
||||
|
||||
f += (l - &metadata[f]) + 1;
|
||||
|
||||
size_align.emplace_back(size, align);
|
||||
} while (true);
|
||||
}
|
||||
|
||||
inline
|
||||
void read_kernarg_metadata(
|
||||
ELFIO::elfio& reader,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
|
||||
// TODO: this is inefficient.
|
||||
auto it = find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_NOTE;
|
||||
});
|
||||
|
||||
if (!it) return;
|
||||
|
||||
const ELFIO::note_section_accessor acc{reader, it};
|
||||
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
|
||||
ELFIO::Elf_Word type{};
|
||||
std::string name{};
|
||||
void* desc{};
|
||||
ELFIO::Elf_Word desc_size{};
|
||||
|
||||
acc.get_note(i, type, name, desc, desc_size);
|
||||
|
||||
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
|
||||
|
||||
std::string tmp{
|
||||
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
|
||||
|
||||
auto dx = tmp.find("Kernels:");
|
||||
|
||||
if (dx == std::string::npos) continue;
|
||||
|
||||
static constexpr decltype(tmp.size()) kernels_sz{8};
|
||||
dx += kernels_sz;
|
||||
|
||||
do {
|
||||
dx = tmp.find("Name:", dx);
|
||||
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) name_sz{5};
|
||||
dx = tmp.find_first_not_of(" '", dx + name_sz);
|
||||
|
||||
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
|
||||
dx += fn.size();
|
||||
|
||||
auto dx1 = tmp.find("CodeProps", dx);
|
||||
dx = tmp.find("Args:", dx);
|
||||
|
||||
if (dx1 < dx) {
|
||||
dx = dx1;
|
||||
continue;
|
||||
}
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) args_sz{5};
|
||||
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
|
||||
} while (true);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
const std::unordered_map<
|
||||
std::string, std::vector<std::pair<std::size_t, std::size_t>>>& kernargs() {
|
||||
static std::unordered_map<
|
||||
std::string, std::vector<std::pair<std::size_t, std::size_t>>> r;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
for (auto&& isa_blobs : code_object_blobs()) {
|
||||
for (auto&& blob : isa_blobs.second) {
|
||||
std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
|
||||
|
||||
ELFIO::elfio reader;
|
||||
|
||||
if (!reader.load(tmp)) continue;
|
||||
|
||||
read_kernarg_metadata(reader, r);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
#include "clara/clara.hpp"
|
||||
#include "pstreams/pstream.h"
|
||||
#include "../src/elfio/elfio.hpp"
|
||||
#include "../include/hip/hcc_detail/elfio/elfio.hpp"
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
@@ -41,99 +41,20 @@ using namespace std;
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
namespace
|
||||
hsa_agent_t target_agent(hipStream_t stream)
|
||||
{
|
||||
inline
|
||||
string name(uintptr_t function_address)
|
||||
{
|
||||
const auto it = function_names().find(function_address);
|
||||
|
||||
if (it == function_names().cend()) {
|
||||
throw runtime_error{
|
||||
"Invalid function passed to hipLaunchKernelGGL."};
|
||||
}
|
||||
|
||||
return it->second;
|
||||
if (stream) {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
stream->locked_getAv()->get_hsa_agent());
|
||||
}
|
||||
|
||||
inline
|
||||
string name(hsa_agent_t agent)
|
||||
{
|
||||
char n[64] = {};
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
|
||||
|
||||
return string{n};
|
||||
else if (
|
||||
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
|
||||
return ihipGetDevice(
|
||||
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_agent_t target_agent(hipStream_t stream)
|
||||
{
|
||||
if (stream) {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
stream->locked_getAv()->get_hsa_agent());
|
||||
}
|
||||
else if (
|
||||
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
|
||||
return ihipGetDevice(
|
||||
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
|
||||
}
|
||||
else {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
accelerator{}.get_default_view().get_hsa_agent());
|
||||
}
|
||||
else {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
accelerator{}.get_default_view().get_hsa_agent());
|
||||
}
|
||||
}
|
||||
|
||||
void hipLaunchKernelGGLImpl(
|
||||
uintptr_t function_address,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void** kernarg)
|
||||
{
|
||||
auto it0 = functions().find(function_address);
|
||||
|
||||
if (it0 == functions().cend()) {
|
||||
// Re-init device code maps once again to help locate kernels
|
||||
// loaded after HIP runtime initialization via means such as
|
||||
// dlopen().
|
||||
it0 = functions(true).find(function_address);
|
||||
if (it0 == functions().cend()) {
|
||||
throw runtime_error{
|
||||
"No device code available for function: " +
|
||||
name(function_address)
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
auto agent = target_agent(stream);
|
||||
|
||||
const auto it1 = find_if(
|
||||
it0->second.cbegin(),
|
||||
it0->second.cend(),
|
||||
[=](const pair<hsa_agent_t, Kernel_descriptor>& x) {
|
||||
return x.first == agent;
|
||||
});
|
||||
|
||||
if (it1 == it0->second.cend()) {
|
||||
throw runtime_error{
|
||||
"No code available for function: " + name(function_address) +
|
||||
", for agent: " + name(agent)
|
||||
};
|
||||
}
|
||||
|
||||
hipModuleLaunchKernel(
|
||||
it1->second,
|
||||
numBlocks.x,
|
||||
numBlocks.y,
|
||||
numBlocks.z,
|
||||
dimBlocks.x,
|
||||
dimBlocks.y,
|
||||
dimBlocks.z,
|
||||
sharedMemBytes,
|
||||
stream,
|
||||
nullptr,
|
||||
kernarg);
|
||||
}
|
||||
}
|
||||
|
||||
+28
-2
@@ -27,6 +27,7 @@ THE SOFTWARE.
|
||||
* everywhere. This file is compiled and linked into apps running HIP / HCC path.
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <exception>
|
||||
#include <stdint.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
@@ -237,7 +238,7 @@ hipError_t ihipSynchronize(void) {
|
||||
//=================================================================================================
|
||||
TidInfo::TidInfo() : _apiSeqNum(0) {
|
||||
_shortTid = g_lastShortTid.fetch_add(1);
|
||||
_pid = getpid();
|
||||
_pid = getpid();
|
||||
|
||||
if (COMPILE_HIP_DB && HIP_TRACE_API) {
|
||||
std::stringstream tid_ss;
|
||||
@@ -2397,7 +2398,7 @@ void ihipStream_t::locked_copy2DAsync(void* dst, const void* src, size_t width,
|
||||
crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
|
||||
copyDevice ? ©Device->getDevice()->_acc : nullptr,
|
||||
forceUnpinnedCopy);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
@@ -2460,3 +2461,28 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** a
|
||||
|
||||
//// TODO - add identifier numbers for streams and devices to help with debugging.
|
||||
// TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx)
|
||||
|
||||
namespace hip_impl {
|
||||
std::vector<hsa_agent_t> all_hsa_agents() {
|
||||
std::vector<hsa_agent_t> r{};
|
||||
for (auto&& acc : hc::accelerator::get_all()) {
|
||||
const auto agent = acc.get_hsa_agent();
|
||||
|
||||
if (!agent || !acc.is_hsa_accelerator()) continue;
|
||||
|
||||
r.emplace_back(*static_cast<hsa_agent_t*>(agent));
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
void hip_throw(const std::exception& ex) {
|
||||
#if defined(__cpp_exceptions)
|
||||
throw ex;
|
||||
#else
|
||||
std::cerr << ex.what() << std::endl;
|
||||
std::terminate();
|
||||
#endif
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
+28
-92
@@ -955,37 +955,14 @@ hipError_t hipHostUnregister(void* hostPtr) {
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
namespace {
|
||||
inline hipDeviceptr_t agent_address_for_symbol(const char* symbolName) {
|
||||
hipDeviceptr_t r = nullptr;
|
||||
namespace hip_impl {
|
||||
hipError_t hipMemcpyToSymbol(void* dst, const void* src, size_t count,
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
const char* symbol_name) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyToSymbol, (TRACE_MCMD), symbol_name, src,
|
||||
count, offset, kind);
|
||||
|
||||
#if __hcc_workweek__ >= 17481
|
||||
size_t byte_cnt = 0u;
|
||||
ihipModuleGetGlobal(&r, &byte_cnt, 0, symbolName);
|
||||
#else
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
auto acc = ctx->getDevice()->_acc;
|
||||
r = acc.get_symbol_address(symbolName);
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src, size_t count, size_t offset,
|
||||
hipMemcpyKind kind) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyToSymbol, (TRACE_MCMD), symbolName, src, count, offset, kind);
|
||||
|
||||
if (symbolName == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
hipDeviceptr_t dst = agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
|
||||
|
||||
if (dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
@@ -1003,21 +980,13 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src, size_t cou
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyFromSymbol(void* dst, const void* src, size_t count,
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
const char* symbol_name) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbol, (TRACE_MCMD), symbol_name, dst,
|
||||
count, offset, kind);
|
||||
|
||||
hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset,
|
||||
hipMemcpyKind kind) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbol, (TRACE_MCMD), symbolName, dst, count, offset, kind);
|
||||
|
||||
if (symbolName == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
hipDeviceptr_t src = agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
|
||||
|
||||
if (dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
@@ -1036,27 +1005,19 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src, size_t count,
|
||||
size_t offset, hipMemcpyKind kind, hipStream_t stream) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyToSymbolAsync, (TRACE_MCMD), symbolName, src, count, offset, kind, stream);
|
||||
hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count,
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
hipStream_t stream, const char* symbol_name) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyToSymbolAsync, (TRACE_MCMD), symbol_name, src,
|
||||
count, offset, kind, stream);
|
||||
|
||||
if (symbolName == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
hipDeviceptr_t dst = agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
|
||||
|
||||
if (dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
if (stream) {
|
||||
try {
|
||||
hip_internal::memcpyAsync((char*)dst+offset, src, count, kind, stream);
|
||||
@@ -1070,28 +1031,19 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src, size_
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count,
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
hipStream_t stream, const char* symbol_name) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbolAsync, (TRACE_MCMD), symbol_name,
|
||||
dst, count, offset, kind, stream);
|
||||
|
||||
hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset,
|
||||
hipMemcpyKind kind, hipStream_t stream) {
|
||||
HIP_INIT_SPECIAL_API(hipMemcpyFromSymbolAsync, (TRACE_MCMD), symbolName, dst, count, offset, kind, stream);
|
||||
|
||||
if (symbolName == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
hipDeviceptr_t src = agent_address_for_symbol(static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src);
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, src);
|
||||
|
||||
if (src == nullptr || dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
if (stream) {
|
||||
try {
|
||||
@@ -1105,23 +1057,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
|
||||
HIP_INIT_API(hipGetSymbolAddress, devPtr, symbolName);
|
||||
|
||||
size_t size = 0;
|
||||
return ihipModuleGetGlobal(devPtr, &size, 0, static_cast<const char*>(symbolName));
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {
|
||||
HIP_INIT_API(hipGetSymbolSize, size, symbolName);
|
||||
|
||||
void* devPtr = nullptr;
|
||||
return ihipModuleGetGlobal(&devPtr, size, 0, static_cast<const char*>(symbolName));
|
||||
}
|
||||
|
||||
} // Namespace hip_impl.
|
||||
|
||||
//---
|
||||
hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) {
|
||||
|
||||
+80
-144
@@ -20,11 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "elfio/elfio.hpp"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hcc_detail/elfio/elfio.hpp"
|
||||
#include "hip/hcc_detail/hsa_helpers.hpp"
|
||||
#include "hip/hcc_detail/program_state.hpp"
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "hsa_helpers.hpp"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include <hsa/amd_hsa_kernel_code.h>
|
||||
@@ -52,7 +52,6 @@ THE SOFTWARE.
|
||||
// TODO Use Pool APIs from HCC to get memory regions.
|
||||
|
||||
using namespace ELFIO;
|
||||
using namespace hip_impl;
|
||||
using namespace std;
|
||||
|
||||
// calculate MD5 checksum
|
||||
@@ -268,13 +267,33 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct Agent_global {
|
||||
string name;
|
||||
hipDeviceptr_t address;
|
||||
uint32_t byte_cnt;
|
||||
};
|
||||
namespace hip_impl {
|
||||
hsa_executable_t executable_for(hipModule_t hmod) {
|
||||
return hmod->executable;
|
||||
}
|
||||
|
||||
const std::string& hash_for(hipModule_t hmod) {
|
||||
return hmod->hash;
|
||||
}
|
||||
|
||||
hsa_agent_t this_agent() {
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
if (!ctx) throw runtime_error{"No active HIP context."};
|
||||
|
||||
auto device = ctx->getDevice();
|
||||
|
||||
if (!device) throw runtime_error{"No device available for HIP."};
|
||||
|
||||
ihipDevice_t* currentDevice = ihipGetDevice(device->_deviceId);
|
||||
|
||||
if (!currentDevice) throw runtime_error{"No active device for HIP."};
|
||||
|
||||
return currentDevice->_hsaAgent;
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
namespace {
|
||||
inline void track(const Agent_global& x, hsa_agent_t agent) {
|
||||
tprintf(DB_MEM, " add variable '%s' with ptr=%p size=%u to tracker\n", x.name.c_str(),
|
||||
x.address, x.byte_cnt);
|
||||
@@ -299,6 +318,8 @@ inline void track(const Agent_global& x, hsa_agent_t agent) {
|
||||
template <typename Container = vector<Agent_global>>
|
||||
inline hsa_status_t copy_agent_global_variables(hsa_executable_t, hsa_agent_t agent,
|
||||
hsa_executable_symbol_t x, void* out) {
|
||||
using namespace hip_impl;
|
||||
|
||||
assert(out);
|
||||
|
||||
hsa_symbol_kind_t t = {};
|
||||
@@ -313,90 +334,9 @@ inline hsa_status_t copy_agent_global_variables(hsa_executable_t, hsa_agent_t ag
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
inline hsa_agent_t this_agent() {
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
if (!ctx) throw runtime_error{"No active HIP context."};
|
||||
|
||||
auto device = ctx->getDevice();
|
||||
|
||||
if (!device) throw runtime_error{"No device available for HIP."};
|
||||
|
||||
ihipDevice_t* currentDevice = ihipGetDevice(device->_deviceId);
|
||||
|
||||
if (!currentDevice) throw runtime_error{"No active device for HIP."};
|
||||
|
||||
return currentDevice->_hsaAgent;
|
||||
}
|
||||
|
||||
inline vector<Agent_global> read_agent_globals(hsa_agent_t agent, hsa_executable_t executable) {
|
||||
vector<Agent_global> r;
|
||||
|
||||
hsa_executable_iterate_agent_symbols(executable, agent, copy_agent_global_variables, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
template <typename ForwardIterator>
|
||||
pair<hipDeviceptr_t, size_t> read_global_description(ForwardIterator f, ForwardIterator l,
|
||||
const char* name) {
|
||||
const auto it = std::find_if(f, l, [=](const Agent_global& x) { return x.name == name; });
|
||||
|
||||
return it == l ? make_pair(nullptr, 0u) : make_pair(it->address, it->byte_cnt);
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
|
||||
const char* name) {
|
||||
// the key of the map would the hash of code object associated with the
|
||||
// hipModule_t instance
|
||||
static unordered_map<std::string, vector<Agent_global>> agent_globals;
|
||||
auto key = hmod->hash;
|
||||
|
||||
if (agent_globals.count(key) == 0) {
|
||||
static mutex mtx;
|
||||
lock_guard<mutex> lck{mtx};
|
||||
|
||||
if (agent_globals.count(key) == 0) {
|
||||
agent_globals.emplace(key, read_agent_globals(this_agent(), hmod->executable));
|
||||
}
|
||||
}
|
||||
|
||||
const auto it0 = agent_globals.find(key);
|
||||
if (it0 == agent_globals.cend()) {
|
||||
throw runtime_error{"agent_globals data structure corrupted."};
|
||||
}
|
||||
|
||||
tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(), it0->second.cend(), name);
|
||||
|
||||
return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes, const char* name) {
|
||||
static unordered_map<hsa_agent_t, vector<Agent_global>> agent_globals;
|
||||
static std::once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
for (auto&& agent_executables : hip_impl::executables()) {
|
||||
vector<Agent_global> tmp0;
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
auto tmp1 = read_agent_globals(agent_executables.first, executable);
|
||||
tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()),
|
||||
make_move_iterator(tmp1.end()));
|
||||
}
|
||||
agent_globals.emplace(agent_executables.first, move(tmp0));
|
||||
}
|
||||
});
|
||||
|
||||
const auto it = agent_globals.find(this_agent());
|
||||
|
||||
if (it == agent_globals.cend()) return hipErrorNotInitialized;
|
||||
|
||||
tie(*dptr, *bytes) = read_global_description(it->second.cbegin(), it->second.cend(), name);
|
||||
|
||||
return *dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
|
||||
hsa_executable_symbol_t find_kernel_by_name(hsa_executable_t executable, const char* kname) {
|
||||
using namespace hip_impl;
|
||||
|
||||
pair<const char*, hsa_executable_symbol_t> r{kname, {}};
|
||||
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
@@ -418,8 +358,8 @@ hsa_executable_symbol_t find_kernel_by_name(hsa_executable_t executable, const c
|
||||
return r.second;
|
||||
}
|
||||
|
||||
string read_elf_file_as_string(
|
||||
const void* file) { // Precondition: file points to an ELF image that was BITWISE loaded
|
||||
string read_elf_file_as_string(const void* file) {
|
||||
// Precondition: file points to an ELF image that was BITWISE loaded
|
||||
// into process accessible memory, and not one loaded by
|
||||
// the loader. This is because in the latter case
|
||||
// alignment may differ, which will break the size
|
||||
@@ -428,15 +368,18 @@ string read_elf_file_as_string(
|
||||
// Little Endian.
|
||||
if (!file) return {};
|
||||
|
||||
auto h = static_cast<const Elf64_Ehdr*>(file);
|
||||
auto h = static_cast<const ELFIO::Elf64_Ehdr*>(file);
|
||||
auto s = static_cast<const char*>(file);
|
||||
// This assumes the common case of SHT being the last part of the ELF.
|
||||
auto sz = sizeof(Elf64_Ehdr) + h->e_shoff + h->e_shentsize * h->e_shnum;
|
||||
auto sz =
|
||||
sizeof(ELFIO::Elf64_Ehdr) + h->e_shoff + h->e_shentsize * h->e_shnum;
|
||||
|
||||
return string{s, s + sz};
|
||||
}
|
||||
|
||||
string code_object_blob_for_agent(const void* maybe_bundled_code, hsa_agent_t agent) {
|
||||
using namespace hip_impl;
|
||||
|
||||
if (!maybe_bundled_code) return {};
|
||||
|
||||
Bundled_code_header tmp{maybe_bundled_code};
|
||||
@@ -454,9 +397,22 @@ string code_object_blob_for_agent(const void* maybe_bundled_code, hsa_agent_t ag
|
||||
|
||||
return string{it->blob.cbegin(), it->blob.cend()};
|
||||
}
|
||||
} // namespace
|
||||
} // Unnamed namespace.
|
||||
|
||||
namespace hip_impl {
|
||||
vector<Agent_global> read_agent_globals(hsa_agent_t agent,
|
||||
hsa_executable_t executable) {
|
||||
vector<Agent_global> r;
|
||||
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
executable, agent, copy_agent_global_variables, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
hipError_t ihipModuleGetFunction(hipFunction_t* func, hipModule_t hmod, const char* name) {
|
||||
using namespace hip_impl;
|
||||
|
||||
if (!func || !name) return hipErrorInvalidValue;
|
||||
|
||||
@@ -485,58 +441,36 @@ hipError_t hipModuleGetFunction(hipFunction_t* hfunc, hipModule_t hmod, const ch
|
||||
return ihipLogStatus(ihipModuleGetFunction(hfunc, hmod, name));
|
||||
}
|
||||
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
|
||||
const char* name) {
|
||||
HIP_INIT_API(hipModuleGetGlobal, dptr, bytes, hmod, name);
|
||||
namespace {
|
||||
hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
|
||||
hipFuncAttributes r{};
|
||||
|
||||
return ihipLogStatus(ihipModuleGetGlobal(dptr, bytes, hmod, name));
|
||||
}
|
||||
hipDeviceProp_t prop{};
|
||||
hipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
|
||||
// TODO: at the moment there is no way to query the count of registers
|
||||
// available per CU, therefore we hardcode it to 64 KiRegisters.
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
hipError_t ihipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
|
||||
const char* name) {
|
||||
if (!dptr || !bytes) return hipErrorInvalidValue;
|
||||
|
||||
if (!name) return hipErrorNotInitialized;
|
||||
|
||||
const auto r = hmod ? read_agent_global_from_module(dptr, bytes, hmod, name)
|
||||
: read_agent_global_from_process(dptr, bytes, name);
|
||||
r.localSizeBytes = header.workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = header.workgroup_group_segment_byte_size;
|
||||
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
|
||||
r.numRegs = header.workitem_vgpr_count;
|
||||
r.maxThreadsPerBlock = r.numRegs ?
|
||||
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
|
||||
prop.maxThreadsPerBlock;
|
||||
r.binaryVersion =
|
||||
header.amd_machine_version_major * 10 +
|
||||
header.amd_machine_version_minor;
|
||||
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
inline
|
||||
hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header)
|
||||
{
|
||||
hipFuncAttributes r{};
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
hipGetDeviceProperties(
|
||||
&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
|
||||
// TODO: at the moment there is no way to query the count of registers
|
||||
// available per CU, therefore we hardcode it to 64 KiRegisters.
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
r.localSizeBytes = header.workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = header.workgroup_group_segment_byte_size;
|
||||
r.maxDynamicSharedSizeBytes =
|
||||
prop.sharedMemPerBlock - r.sharedSizeBytes;
|
||||
r.numRegs = header.workitem_vgpr_count;
|
||||
r.maxThreadsPerBlock = r.numRegs ?
|
||||
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
|
||||
prop.maxThreadsPerBlock;
|
||||
r.binaryVersion =
|
||||
header.amd_machine_version_major * 10 +
|
||||
header.amd_machine_version_minor;
|
||||
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
|
||||
|
||||
return r;
|
||||
}
|
||||
}
|
||||
} // Unnamed namespace.
|
||||
|
||||
hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
|
||||
{
|
||||
using namespace hip_impl;
|
||||
|
||||
if (!attr) return hipErrorInvalidValue;
|
||||
if (!func) return hipErrorInvalidDeviceFunction;
|
||||
|
||||
@@ -564,6 +498,7 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
|
||||
}
|
||||
|
||||
hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
|
||||
using namespace hip_impl;
|
||||
|
||||
if (!module) return hipErrorInvalidValue;
|
||||
|
||||
@@ -585,9 +520,8 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
|
||||
|
||||
auto content = tmp.empty() ? read_elf_file_as_string(image) : tmp;
|
||||
|
||||
(*module)->executable = hip_impl::load_executable(content,
|
||||
(*module)->executable,
|
||||
this_agent());
|
||||
(*module)->executable = load_executable(content, (*module)->executable,
|
||||
this_agent());
|
||||
|
||||
// compute the hash of the code object
|
||||
(*module)->hash = checksum(content.length(), content.data());
|
||||
@@ -621,6 +555,8 @@ hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned
|
||||
}
|
||||
|
||||
hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name) {
|
||||
using namespace hip_impl;
|
||||
|
||||
HIP_INIT_API(hipModuleGetTexRef, texRef, hmod, name);
|
||||
|
||||
hipError_t ret = hipErrorNotFound;
|
||||
|
||||
@@ -1,659 +0,0 @@
|
||||
#include "../include/hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "hsa_helpers.hpp"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include "elfio/elfio.hpp"
|
||||
|
||||
#include <link.h>
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace ELFIO;
|
||||
using namespace hip_impl;
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
struct Symbol {
|
||||
string name;
|
||||
ELFIO::Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
};
|
||||
|
||||
inline Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx) {
|
||||
assert(idx < section.get_symbols_num());
|
||||
|
||||
Symbol r;
|
||||
section.get_symbol(idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline section* find_section_if(elfio& reader, P p) {
|
||||
const auto it = find_if(reader.sections.begin(), reader.sections.end(), move(p));
|
||||
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
vector<string> copy_names_of_undefined_symbols(const symbol_section_accessor& section) {
|
||||
vector<string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
|
||||
auto tmp = read_symbol(section, i);
|
||||
if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) {
|
||||
r.push_back(std::move(tmp.name));
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const std::unordered_map<std::string, std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>&
|
||||
symbol_addresses(bool rebuild = false) {
|
||||
static unordered_map<string, pair<Elf64_Addr, Elf_Xword>> r;
|
||||
static once_flag f;
|
||||
|
||||
auto cons = [rebuild]() {
|
||||
if (rebuild) {
|
||||
r.clear();
|
||||
}
|
||||
|
||||
dl_iterate_phdr(
|
||||
[](dl_phdr_info* info, size_t, void*) {
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
elfio reader;
|
||||
|
||||
static unsigned int iter = 0u;
|
||||
if (reader.load(!iter ? self : info->dlpi_name)) {
|
||||
auto it = find_section_if(
|
||||
reader, [](const class section* x) { return x->get_type() == SHT_SYMTAB; });
|
||||
|
||||
if (it) {
|
||||
const symbol_section_accessor symtab{reader, it};
|
||||
|
||||
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
|
||||
auto tmp = read_symbol(symtab, i);
|
||||
|
||||
if (tmp.type == STT_OBJECT && tmp.sect_idx != SHN_UNDEF) {
|
||||
const auto addr = tmp.value + (iter ? info->dlpi_addr : 0);
|
||||
r.emplace(move(tmp.name), make_pair(addr, tmp.size));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++iter;
|
||||
}
|
||||
|
||||
return 0;
|
||||
},
|
||||
nullptr);
|
||||
};
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void associate_code_object_symbols_with_host_allocation(const elfio& reader,
|
||||
section* code_object_dynsym,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable) {
|
||||
if (!code_object_dynsym) return;
|
||||
|
||||
const auto undefined_symbols =
|
||||
copy_names_of_undefined_symbols(symbol_section_accessor{reader, code_object_dynsym});
|
||||
|
||||
for (auto&& x : undefined_symbols) {
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
|
||||
const auto it1 = symbol_addresses().find(x);
|
||||
|
||||
if (it1 == symbol_addresses().cend()) {
|
||||
throw runtime_error{"Global symbol: " + x + " is undefined."};
|
||||
}
|
||||
|
||||
static mutex mtx;
|
||||
lock_guard<mutex> lck{mtx};
|
||||
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
globals().emplace(x, (void*)(it1->second.first));
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(reinterpret_cast<void*>(it1->second.first), it1->second.second,
|
||||
nullptr, // All agents.
|
||||
0, &p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(executable, agent, x.c_str(), p);
|
||||
}
|
||||
}
|
||||
|
||||
vector<char> code_object_blob_for_process() {
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
static constexpr const char kernel_section[] = ".kernel";
|
||||
|
||||
elfio reader;
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{"Failed to load ELF file for current process."};
|
||||
}
|
||||
|
||||
auto kernels =
|
||||
find_section_if(reader, [](const section* x) { return x->get_name() == kernel_section; });
|
||||
|
||||
vector<char> r;
|
||||
if (kernels) {
|
||||
r.insert(r.end(), kernels->get_data(), kernels->get_data() + kernels->get_size());
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<hsa_isa_t, vector<vector<char>>>& code_object_blobs(bool rebuild = false) {
|
||||
static unordered_map<hsa_isa_t, vector<vector<char>>> r;
|
||||
static once_flag f;
|
||||
|
||||
auto cons = [rebuild]() {
|
||||
// names of shared libraries who .kernel sections already loaded
|
||||
static unordered_set<string> lib_names;
|
||||
static vector<vector<char>> blobs{code_object_blob_for_process()};
|
||||
|
||||
if (rebuild) {
|
||||
r.clear();
|
||||
blobs.clear();
|
||||
}
|
||||
|
||||
dl_iterate_phdr(
|
||||
[](dl_phdr_info* info, std::size_t, void*) {
|
||||
elfio tmp;
|
||||
if ((lib_names.find(info->dlpi_name) == lib_names.end()) &&
|
||||
(tmp.load(info->dlpi_name))) {
|
||||
const auto it = find_section_if(
|
||||
tmp, [](const section* x) { return x->get_name() == ".kernel"; });
|
||||
|
||||
if (it) {
|
||||
blobs.emplace_back(
|
||||
it->get_data(), it->get_data() + it->get_size());
|
||||
// register the shared library as already loaded
|
||||
lib_names.emplace(info->dlpi_name);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
},
|
||||
nullptr);
|
||||
|
||||
for (auto&& blob : blobs) {
|
||||
for (auto sub_blob = blob.begin(); sub_blob != blob.end(); ) {
|
||||
Bundled_code_header tmp(sub_blob, blob.end());
|
||||
if (valid(tmp)) {
|
||||
for (auto&& bundle : bundles(tmp)) {
|
||||
r[triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
|
||||
}
|
||||
sub_blob+=tmp.bundled_code_size;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
vector<pair<uintptr_t, string>> function_names_for(const elfio& reader, section* symtab) {
|
||||
vector<pair<uintptr_t, string>> r;
|
||||
symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
auto tmp = read_symbol(symbols, i);
|
||||
|
||||
if (tmp.type == STT_FUNC && tmp.sect_idx != SHN_UNDEF && !tmp.name.empty()) {
|
||||
r.emplace_back(tmp.value, tmp.name);
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const vector<pair<uintptr_t, string>>& function_names_for_process(bool rebuild = false) {
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
|
||||
static vector<pair<uintptr_t, string>> r;
|
||||
static once_flag f;
|
||||
|
||||
auto cons = [rebuild]() {
|
||||
elfio reader;
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{"Failed to load the ELF file for the current process."};
|
||||
}
|
||||
|
||||
auto symtab =
|
||||
find_section_if(reader, [](const section* x) { return x->get_type() == SHT_SYMTAB; });
|
||||
|
||||
if (symtab) r = function_names_for(reader, symtab);
|
||||
};
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels(bool rebuild = false) {
|
||||
static unordered_map<string, vector<hsa_executable_symbol_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
auto cons = [rebuild]() {
|
||||
if (rebuild) {
|
||||
r.clear();
|
||||
executables(rebuild);
|
||||
}
|
||||
|
||||
static const auto copy_kernels = [](hsa_executable_t, hsa_agent_t,
|
||||
hsa_executable_symbol_t s, void*) {
|
||||
if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
for (auto&& agent_executables : executables()) {
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
hsa_executable_iterate_agent_symbols(executable, agent_executables.first,
|
||||
copy_kernels, nullptr);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void load_code_object_and_freeze_executable(
|
||||
const string& file, hsa_agent_t agent,
|
||||
hsa_executable_t
|
||||
executable) { // TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
|
||||
if (p) {
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
delete p;
|
||||
}
|
||||
};
|
||||
|
||||
using RAII_code_reader = unique_ptr<hsa_code_object_reader_t, decltype(cor_deleter)>;
|
||||
|
||||
if (!file.empty()) {
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_memory(file.data(), file.size(), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
static vector<RAII_code_reader> code_readers;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<mutex> lck{mtx};
|
||||
code_readers.push_back(move(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
size_t parse_args(
|
||||
const string& metadata,
|
||||
size_t f,
|
||||
size_t l,
|
||||
vector<pair<size_t, size_t>>& size_align) {
|
||||
if (f == l) return f;
|
||||
if (!size_align.empty()) return l;
|
||||
|
||||
do {
|
||||
static constexpr size_t size_sz{5};
|
||||
f = metadata.find("Size:", f) + size_sz;
|
||||
|
||||
if (l <= f) return f;
|
||||
|
||||
auto size = strtoul(&metadata[f], nullptr, 10);
|
||||
|
||||
static constexpr size_t align_sz{6};
|
||||
f = metadata.find("Align:", f) + align_sz;
|
||||
|
||||
char* l{};
|
||||
auto align = strtoul(&metadata[f], &l, 10);
|
||||
|
||||
f += (l - &metadata[f]) + 1;
|
||||
|
||||
size_align.emplace_back(size, align);
|
||||
} while (true);
|
||||
}
|
||||
|
||||
void read_kernarg_metadata(
|
||||
elfio& reader,
|
||||
unordered_map<string, vector<pair<size_t, size_t>>>& kernargs)
|
||||
{ // TODO: this is inefficient.
|
||||
auto it = find_section_if(
|
||||
reader, [](const section* x) { return x->get_type() == SHT_NOTE; });
|
||||
|
||||
if (!it) return;
|
||||
|
||||
const note_section_accessor acc{reader, it};
|
||||
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
|
||||
ELFIO::Elf_Word type{};
|
||||
string name{};
|
||||
void* desc{};
|
||||
Elf_Word desc_size{};
|
||||
|
||||
acc.get_note(i, type, name, desc, desc_size);
|
||||
|
||||
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
|
||||
|
||||
string tmp{
|
||||
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
|
||||
|
||||
auto dx = tmp.find("Kernels:");
|
||||
|
||||
if (dx == string::npos) continue;
|
||||
|
||||
static constexpr decltype(tmp.size()) kernels_sz{8};
|
||||
dx += kernels_sz;
|
||||
|
||||
do {
|
||||
dx = tmp.find("Name:", dx);
|
||||
|
||||
if (dx == string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) name_sz{5};
|
||||
dx = tmp.find_first_not_of(" '", dx + name_sz);
|
||||
|
||||
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
|
||||
dx += fn.size();
|
||||
|
||||
auto dx1 = tmp.find("CodeProps", dx);
|
||||
dx = tmp.find("Args:", dx);
|
||||
|
||||
if (dx1 < dx) {
|
||||
dx = dx1;
|
||||
continue;
|
||||
}
|
||||
if (dx == string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) args_sz{5};
|
||||
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
|
||||
} while (true);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace hip_impl {
|
||||
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>&
|
||||
executables(bool rebuild) { // TODO: This leaks the hsa_executable_ts, it should use RAII.
|
||||
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
auto cons = [rebuild]() {
|
||||
static const auto accelerators = hc::accelerator::get_all();
|
||||
|
||||
if (rebuild) {
|
||||
// do NOT clear r so we reuse instances of hsa_executable_t
|
||||
// created previously
|
||||
code_object_blobs(rebuild);
|
||||
}
|
||||
|
||||
for (auto&& acc : accelerators) {
|
||||
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
|
||||
|
||||
if (!agent || !acc.is_hsa_accelerator()) continue;
|
||||
|
||||
hsa_agent_iterate_isas(*agent,
|
||||
[](hsa_isa_t x, void* pa) {
|
||||
const auto it = code_object_blobs().find(x);
|
||||
|
||||
if (it != code_object_blobs().cend()) {
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
|
||||
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only
|
||||
// meant for illustration.
|
||||
string blob_to_str{blob.cbegin(), blob.cend()};
|
||||
tmp = load_executable(blob_to_str, tmp, a);
|
||||
|
||||
if (tmp.handle) r[a].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
},
|
||||
agent);
|
||||
}
|
||||
};
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<uintptr_t, string>& function_names(bool rebuild) {
|
||||
static unordered_map<uintptr_t, string> r{function_names_for_process().cbegin(),
|
||||
function_names_for_process().cend()};
|
||||
static once_flag f;
|
||||
|
||||
auto cons = [rebuild]() {
|
||||
if (rebuild) {
|
||||
r.clear();
|
||||
function_names_for_process(rebuild);
|
||||
r.insert(function_names_for_process().cbegin(),
|
||||
function_names_for_process().cend());
|
||||
}
|
||||
|
||||
dl_iterate_phdr(
|
||||
[](dl_phdr_info* info, size_t, void*) {
|
||||
elfio tmp;
|
||||
if (tmp.load(info->dlpi_name)) {
|
||||
const auto it = find_section_if(
|
||||
tmp, [](const section* x) { return x->get_type() == SHT_SYMTAB; });
|
||||
|
||||
if (it) {
|
||||
auto n = function_names_for(tmp, it);
|
||||
|
||||
for (auto&& f : n) f.first += info->dlpi_addr;
|
||||
|
||||
r.insert(make_move_iterator(n.begin()), make_move_iterator(n.end()));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
},
|
||||
nullptr);
|
||||
};
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
static mutex mtx;
|
||||
lock_guard<mutex> lck{mtx};
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions(bool rebuild) {
|
||||
static unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
|
||||
static once_flag f;
|
||||
|
||||
auto cons = [rebuild]() {
|
||||
if (rebuild) {
|
||||
// do NOT clear r so we reuse instances of pair<hsa_agent_t, Kernel_descriptor>
|
||||
// created previously
|
||||
|
||||
function_names(rebuild);
|
||||
kernargs(rebuild);
|
||||
kernels(rebuild);
|
||||
globals(rebuild);
|
||||
}
|
||||
|
||||
for (auto&& function : function_names()) {
|
||||
const auto it = kernels().find(function.second);
|
||||
|
||||
if (it != kernels().cend()) {
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
r[function.first].emplace_back(
|
||||
agent(kernel_symbol),
|
||||
Kernel_descriptor{kernel_object(kernel_symbol), it->first});
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
static mutex mtx;
|
||||
lock_guard<mutex> lck{mtx};
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
unordered_map<string, void*>& globals(bool rebuild) {
|
||||
static unordered_map<string, void*> r;
|
||||
static once_flag f;
|
||||
auto cons =[rebuild]() {
|
||||
if (rebuild) {
|
||||
r.clear();
|
||||
symbol_addresses(rebuild);
|
||||
}
|
||||
|
||||
r.reserve(symbol_addresses().size());
|
||||
};
|
||||
|
||||
call_once(f, cons);
|
||||
if (rebuild) {
|
||||
cons();
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<string, vector<pair<size_t, size_t>>>& kernargs(
|
||||
bool rebuild) {
|
||||
static unordered_map<string, vector<pair<size_t, size_t>>> r;
|
||||
static once_flag f;
|
||||
|
||||
static const auto build_map = [](decltype(r)& x) {
|
||||
for (auto&& isa_blobs : code_object_blobs()) {
|
||||
for (auto&& blob : isa_blobs.second) {
|
||||
stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
|
||||
|
||||
elfio reader;
|
||||
if (!reader.load(tmp)) continue;
|
||||
|
||||
read_kernarg_metadata(reader, x);
|
||||
}
|
||||
}
|
||||
};
|
||||
call_once(f, []() { r.reserve(function_names().size()); build_map(r); });
|
||||
|
||||
if (rebuild) {
|
||||
static mutex mtx;
|
||||
thread_local static decltype(r) tmp;
|
||||
|
||||
{
|
||||
lock_guard<mutex> lck{mtx};
|
||||
|
||||
tmp.insert(r.cbegin(), r.cend()); // Should use merge in C++17.
|
||||
}
|
||||
|
||||
build_map(tmp);
|
||||
|
||||
lock_guard<mutex> lck{mtx};
|
||||
|
||||
r.insert(tmp.cbegin(), tmp.cend());
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
hsa_executable_t load_executable(const string& file, hsa_executable_t executable,
|
||||
hsa_agent_t agent) {
|
||||
elfio reader;
|
||||
stringstream tmp{file};
|
||||
|
||||
if (!reader.load(tmp)) return hsa_executable_t{};
|
||||
|
||||
const auto code_object_dynsym = find_section_if(
|
||||
reader, [](const ELFIO::section* x) { return x->get_type() == SHT_DYNSYM; });
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(reader, code_object_dynsym, agent,
|
||||
executable);
|
||||
|
||||
load_code_object_and_freeze_executable(file, agent, executable);
|
||||
|
||||
return executable;
|
||||
}
|
||||
|
||||
// HIP startup kernel loader logic
|
||||
// When enabled HIP_STARTUP_LOADER, HIP will load the kernels and setup
|
||||
// the function symbol map on program startup
|
||||
extern "C" void __attribute__((constructor)) __startup_kernel_loader_init() {
|
||||
int hip_startup_loader=0;
|
||||
if (std::getenv("HIP_STARTUP_LOADER"))
|
||||
hip_startup_loader = atoi(std::getenv("HIP_STARTUP_LOADER"));
|
||||
if (hip_startup_loader) functions(true);
|
||||
}
|
||||
|
||||
extern "C" void __attribute__((destructor)) __startup_kernel_loader_fini() {
|
||||
}
|
||||
|
||||
} // Namespace hip_impl.
|
||||
Ссылка в новой задаче
Block a user