From d0252dfa79cdc1127dda008aa386702fb6fecf16 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Sun, 12 May 2019 09:54:03 -0400 Subject: [PATCH] migrate program_state logic from header into shared library (phase I) (#1077) * Revert "Revert "Use COMgr to read Kernel Args Metadata (#1006)"" This reverts commit 62e96cb4cf36f43aa191eb92a3e096908eff41c9. * Revert "Use COMgr to read Kernel Args Metadata (#1006)" This reverts commit 882006555bcd1ce9dc7ead3ea827c13ceb596736. * Revert "improve program state commentary" This reverts commit fb2beb0c88265f372549450a3f0b30c6e08b03d2. * Revert "load program state once per agent" This reverts commit 21f5e142f5d7c6f6de86966f4399e0578848b2de. * start moving function_names() into the hip shared lib * start moving code_object_blobs to a new "state" object * Consolidate various program state related static objects into a single program_state object * minor clean up * move more stuffs from functional_grid_launch into program_state * debug make_kernarg * moving lookup for kernargs size_align into program_state * clean up old code for kernarg size and alignment * update hip_module to use newer api in program_state * Create public member functions for program_state * move most program state functions into shared library * Pass the data buffer size to load_executable Otherwise, it can't figure what the data size is just from the char* (since the data is not really a string) * turning free functions in program state into members of program_state_impl * change the free function globals() into a member of program_state_impl * replace the static mutex used for populating globals * moving associate_code_object_symbols_with_host_allocation into program_state_impl * move load_code_object_and_freeze_executable into program_state_impl * moving executables and functions_names into program_state_impl * moving kernels() into program_state_impl * moving functions() into program_state_impl * move get_kernargs into program_state_impl * moving kernel_descriptor into program_state_impl * moving kernargs_size_align calculation into program_state_impl * Changing the handle to program_state_impl to a pointer * moving program_state_impl into a separate inline source file * fixing/cleaning up some header file includes * moving member function for kernargs_size_align into program_state.cpp * moving Kernel_descriptor into program_state.inl * add a new class to manage agent globals * moving all agent globals processing functions into agent_globals_impl * load program state once per agent re-merging PR991 against other program state changes * fix per-agent program state member initialization * cache executables based on elf name, isa, and agent. This avoids program state reloading executables after a shared library is dlopened. re-merging PR1057 against other program state changes * protect executables cache by a global mutex * return ref to executables cache * adapt PR#981 Make hipModuleGetGlobal be in HIP runtime [ROCm/hip commit: f5eb91d53dd548affce17527ce350a59240b6a1c] --- projects/hip/CMakeLists.txt | 1 + .../hip/hcc_detail/functional_grid_launch.hpp | 65 +- .../include/hip/hcc_detail/hip_runtime_api.h | 191 +++-- .../include/hip/hcc_detail/program_state.hpp | 625 +-------------- projects/hip/src/hip_clang.cpp | 4 +- projects/hip/src/hip_hcc.cpp | 10 + projects/hip/src/hip_module.cpp | 21 +- projects/hip/src/program_state.cpp | 63 ++ projects/hip/src/program_state.inl | 713 ++++++++++++++++++ 9 files changed, 961 insertions(+), 732 deletions(-) create mode 100644 projects/hip/src/program_state.cpp create mode 100644 projects/hip/src/program_state.inl diff --git a/projects/hip/CMakeLists.txt b/projects/hip/CMakeLists.txt index 78d2bc7a30..30aa071515 100644 --- a/projects/hip/CMakeLists.txt +++ b/projects/hip/CMakeLists.txt @@ -232,6 +232,7 @@ if(HIP_PLATFORM STREQUAL "hcc") set(SOURCE_FILES_RUNTIME src/code_object_bundle.cpp + src/program_state.cpp src/hip_clang.cpp src/hip_hcc.cpp src/hip_context.cpp diff --git a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp index 42326cbdc1..8082188489 100644 --- a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp @@ -59,7 +59,7 @@ template < typename std::enable_if::type* = nullptr> inline std::vector make_kernarg( const std::tuple&, - const std::vector>&, + const kernargs_size_align&, std::vector kernarg) { return kernarg; } @@ -70,7 +70,7 @@ template < typename std::enable_if::type* = nullptr> inline std::vector make_kernarg( const std::tuple& formals, - const std::vector>& size_align, + const kernargs_size_align& size_align, std::vector kernarg) { using T = typename std::tuple_element>::type; @@ -86,13 +86,12 @@ inline std::vector make_kernarg( #endif kernarg.resize(round_up_to_next_multiple_nonnegative( - kernarg.size(), size_align[n].second) + size_align[n].first); + kernarg.size(), size_align.alignment(n)) + size_align.size(n)); std::memcpy( - kernarg.data() + kernarg.size() - size_align[n].first, + kernarg.data() + kernarg.size() - size_align.size(n), &std::get(formals), - size_align[n].first); - + size_align.size(n)); return make_kernarg(formals, size_align, std::move(kernarg)); } @@ -104,45 +103,17 @@ inline std::vector make_kernarg( if (sizeof...(Formals) == 0) return {}; - auto it = function_names().find(reinterpret_cast(kernel)); - if (it == function_names().cend()) { - hip_throw(std::runtime_error{"Undefined __global__ function."}); - } - - auto it1 = kernargs().find(it->second); - if (it1 == kernargs().end()) { - hip_throw(std::runtime_error{ - "Missing metadata for __global__ function: " + it->second}); - } - std::tuple to_formals{std::move(actuals)}; std::vector kernarg; kernarg.reserve(sizeof(to_formals)); - return make_kernarg<0>(to_formals, it1->second, std::move(kernarg)); + auto& ps = hip_impl::get_program_state(); + return make_kernarg<0>(to_formals, + ps.get_kernargs_size_align( + reinterpret_cast(kernel)), + std::move(kernarg)); } -inline -std::string name(std::uintptr_t function_address) -{ - const auto it = function_names().find(function_address); - - if (it == function_names().cend()) { - hip_throw(std::runtime_error{ - "Invalid function passed to hipLaunchKernelGGL."}); - } - - return it->second; -} - -inline -std::string name(hsa_agent_t agent) -{ - char n[64]{}; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); - - return std::string{n}; -} hsa_agent_t target_agent(hipStream_t stream); @@ -156,17 +127,10 @@ void hipLaunchKernelGGLImpl( hipStream_t stream, void** kernarg) { - auto agent = target_agent(stream); - auto it = functions(agent).find(function_address); + const auto& kd = hip_impl::get_program_state().kernel_descriptor(function_address, + target_agent(stream)); - if (it == functions(agent).cend()) { - hip_throw(std::runtime_error{ - "No device code available for function: " + - name(function_address) + - ", for agent: " + name(agent)}); - } - - hipModuleLaunchKernel(it->second, numBlocks.x, numBlocks.y, numBlocks.z, + hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z, dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes, stream, nullptr, kernarg); } @@ -178,8 +142,7 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) { hip_impl::hip_init(); - auto kernarg = hip_impl::make_kernarg( - kernel, std::tuple{std::move(args)...}); + auto kernarg = hip_impl::make_kernarg(kernel, std::tuple{std::move(args)...}); std::size_t kernarg_size = kernarg.size(); void* config[]{ diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index b6eb21e0ef..d9da632577 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -67,6 +67,7 @@ THE SOFTWARE. #define HIP_LAUNCH_PARAM_END ((void*)0x03) #ifdef __cplusplus + #include #include #include #include @@ -2643,90 +2644,129 @@ std::vector read_agent_globals(hsa_agent_t agent, hsa_executable_t executable); hsa_agent_t this_agent(); + +class agent_globals_impl { +private: + std::pair< + std::mutex, + std::unordered_map< + std::string, std::vector>> globals_from_module; + + std::unordered_map< + hsa_agent_t, + std::pair< + std::once_flag, + std::vector>> globals_from_process; + +public: + + hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes, + hipModule_t hmod, const char* name) { + // the key of the map would the hash of code object associated with the + // hipModule_t instance + std::string key(hash_for(hmod)); + + if (globals_from_module.second.count(key) == 0) { + std::lock_guard lck{globals_from_module.first}; + + if (globals_from_module.second.count(key) == 0) { + globals_from_module.second.emplace( + key, read_agent_globals(this_agent(), executable_for(hmod))); + } + } + + const auto it0 = globals_from_module.second.find(key); + if (it0 == globals_from_module.second.cend()) { + hip_throw( + std::runtime_error{"agent_globals data structure corrupted."}); + } + + std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(), + it0->second.cend(), name); + + // HACK for SWDEV-173477 + // + // For code objects with global symbols of length 0, ROCR runtime would + // ignore them even though they exist in the symbol table. Therefore the + // result from read_agent_globals() can't be trusted entirely. + // + // As a workaround to tame applications which depend on the existence of + // global symbols with length 0, always return hipSuccess here. + // + // This behavior shall be reverted once ROCR runtime has been fixed to + // address SWDEV-173477 + + //return *dptr ? hipSuccess : hipErrorNotFound; + return hipSuccess; + } + + hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes, + const char* name) { + + auto agent = this_agent(); + + std::call_once(globals_from_process[agent].first, [this](hsa_agent_t aa) { + std::vector tmp0; + for (auto&& executable : hip_impl::get_program_state().executables(aa)) { + auto tmp1 = read_agent_globals(aa, executable); + tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()), + make_move_iterator(tmp1.end())); + } + globals_from_process[aa].second = move(move(tmp0)); + }, agent); + + const auto it = globals_from_process.find(agent); + + if (it == globals_from_process.cend()) return hipErrorNotInitialized; + + std::tie(*dptr, *bytes) = read_global_description(it->second.second.cbegin(), + it->second.second.cend(), name); + + return *dptr ? hipSuccess : hipErrorNotFound; + } + +}; + +class agent_globals { +public: + agent_globals() : impl(new agent_globals_impl()) { + if (!impl) + hip_throw( + std::runtime_error{"Error when constructing agent global data structures."}); + } + ~agent_globals() { delete impl; } + + hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes, + hipModule_t hmod, const char* name) { + return impl->read_agent_global_from_module(dptr, bytes, hmod, name); + } + + hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes, + const char* name) { + return impl->read_agent_global_from_process(dptr, bytes, name); + } + +private: + agent_globals_impl* impl; +}; + inline __attribute__((visibility("hidden"))) -hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes, - hipModule_t hmod, const char* name) { - // the key of the map would the hash of code object associated with the - // hipModule_t instance - static std::unordered_map< - std::string, std::vector> agent_globals; - std::string key(hash_for(hmod)); - - if (agent_globals.count(key) == 0) { - static std::mutex mtx; - std::lock_guard lck{mtx}; - - if (agent_globals.count(key) == 0) { - agent_globals.emplace( - key, read_agent_globals(this_agent(), executable_for(hmod))); - } - } - - const auto it0 = agent_globals.find(key); - if (it0 == agent_globals.cend()) { - hip_throw( - std::runtime_error{"agent_globals data structure corrupted."}); - } - - std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(), - it0->second.cend(), name); - - // HACK for SWDEV-173477 - // - // For code objects with global symbols of length 0, ROCR runtime would - // ignore them even though they exist in the symbol table. Therefore the - // result from read_agent_globals() can't be trusted entirely. - // - // As a workaround to tame applications which depend on the existence of - // global symbols with length 0, always return hipSuccess here. - // - // This behavior shall be reverted once ROCR runtime has been fixed to - // address SWDEV-173477 - - //return *dptr ? hipSuccess : hipErrorNotFound; - return hipSuccess; +agent_globals& get_agent_globals() { + static agent_globals ag; + return ag; } + +extern "C" inline __attribute__((visibility("hidden"))) hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes, const char* name) { - static std::unordered_map>> globals; - static std::once_flag f; - auto agent = this_agent(); - - // Create placeholder for each agent in the map. - std::call_once(f, []() { - for (auto&& x : hip_impl::all_hsa_agents()) { - (void)globals[x]; - } - }); - - if (globals.find(agent) == globals.cend()) { - hip_throw(std::runtime_error{"invalid agent"}); - } - - std::call_once(globals[agent].first, [](hsa_agent_t aa) { - std::vector tmp0; - for (auto&& executable : executables(aa)) { - auto tmp1 = read_agent_globals(aa, executable); - tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()), - make_move_iterator(tmp1.end())); - } - globals[aa].second = move(tmp0); - }, agent); - - const auto it = globals.find(agent); - - if (it == globals.cend()) return hipErrorNotInitialized; - - std::tie(*dptr, *bytes) = read_global_description(it->second.second.cbegin(), - it->second.second.cend(), name); - - return *dptr ? hipSuccess : hipErrorNotFound; + return get_agent_globals().read_agent_global_from_process(dptr, bytes, name); } + + } // Namespace hip_impl. #if defined(__cplusplus) @@ -2748,6 +2788,7 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, #endif // __HIP_VDI__ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name); + /** * @brief builds module from code object which resides in host memory. Image is pointer to that * location. diff --git a/projects/hip/include/hip/hcc_detail/program_state.hpp b/projects/hip/include/hip/hcc_detail/program_state.hpp index f05f41d3f5..ca8b6dcd88 100644 --- a/projects/hip/include/hip/hcc_detail/program_state.hpp +++ b/projects/hip/include/hip/hcc_detail/program_state.hpp @@ -22,38 +22,17 @@ THE SOFTWARE. #pragma once -#include "code_object_bundle.hpp" -#include "hsa_helpers.hpp" - -#if !defined(__cpp_exceptions) - #define try if (true) - #define catch(...) if (false) -#endif -#include "elfio/elfio.hpp" -#if !defined(__cpp_exceptions) - #undef try - #undef catch -#endif - #include #include #include #include -#include - -#include #include #include #include -#include -#include -#include -#include #include #include #include -#include #include struct ihipModuleSymbol_t; @@ -84,590 +63,48 @@ inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) { namespace hip_impl { -std::vector all_hsa_agents(); - -class Kernel_descriptor { - std::uint64_t kernel_object_{}; - amd_kernel_code_t const* kernel_header_{nullptr}; - std::string name_{}; -public: - Kernel_descriptor() = default; - Kernel_descriptor(std::uint64_t kernel_object, const std::string& name) - : kernel_object_{kernel_object}, name_{name} - { - bool supported{false}; - std::uint16_t min_v{UINT16_MAX}; - auto r = hsa_system_major_extension_supported( - HSA_EXTENSION_AMD_LOADER, 1, &min_v, &supported); - - if (r != HSA_STATUS_SUCCESS || !supported) return; - - hsa_ven_amd_loader_1_01_pfn_t tbl{}; - - r = hsa_system_get_major_extension_table( - HSA_EXTENSION_AMD_LOADER, - 1, - sizeof(tbl), - reinterpret_cast(&tbl)); - - if (r != HSA_STATUS_SUCCESS) return; - if (!tbl.hsa_ven_amd_loader_query_host_address) return; - - r = tbl.hsa_ven_amd_loader_query_host_address( - reinterpret_cast(kernel_object_), - reinterpret_cast(&kernel_header_)); - - if (r != HSA_STATUS_SUCCESS) return; - } - Kernel_descriptor(const Kernel_descriptor&) = default; - Kernel_descriptor(Kernel_descriptor&&) = default; - ~Kernel_descriptor() = default; - - Kernel_descriptor& operator=(const Kernel_descriptor&) = default; - Kernel_descriptor& operator=(Kernel_descriptor&&) = default; - - operator hipFunction_t() const { // TODO: this is awful and only meant for illustration. - return reinterpret_cast(const_cast(this)); - } -}; - -template -inline -ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) { - const auto it = std::find_if( - reader.sections.begin(), reader.sections.end(), std::move(p)); - - return it != reader.sections.end() ? *it : nullptr; -} - -inline -__attribute__((visibility("hidden"))) -const std::unordered_map< - hsa_isa_t, std::vector>>& code_object_blobs() { - static std::unordered_map>> r; - static std::once_flag f; - - std::call_once(f, []() { - static std::vector> blobs{}; - - dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) { - ELFIO::elfio tmp; - - const auto elf = - info->dlpi_addr ? info->dlpi_name : "/proc/self/exe"; - - if (!tmp.load(elf)) return 0; - - const auto it = find_section_if(tmp, [](const ELFIO::section* x) { - return x->get_name() == ".kernel"; - }); - - if (!it) return 0; - - blobs.emplace_back(it->get_data(), it->get_data() + it->get_size()); - - return 0; - }, nullptr); - - for (auto&& multi_arch_blob : blobs) { - auto it = multi_arch_blob.begin(); - while (it != multi_arch_blob.end()) { - Bundled_code_header tmp{it, multi_arch_blob.end()}; - - if (!valid(tmp)) break; - - for (auto&& bundle : bundles(tmp)) { - r[triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob); - } - - it += tmp.bundled_code_size; - }; - } - }); - - return r; -} - -struct Symbol { - std::string name; - ELFIO::Elf64_Addr value = 0; - ELFIO::Elf_Xword size = 0; - ELFIO::Elf_Half sect_idx = 0; - std::uint8_t bind = 0; - std::uint8_t type = 0; - std::uint8_t other = 0; -}; - -inline -Symbol read_symbol(const ELFIO::symbol_section_accessor& section, - unsigned int idx) { - assert(idx < section.get_symbols_num()); - - Symbol r; - section.get_symbol( - idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); - - return r; -} - -inline -__attribute__((visibility("hidden"))) -const std::unordered_map< - std::string, - std::pair>& symbol_addresses() { - static std::unordered_map< - std::string, std::pair> r; - static std::once_flag f; - - std::call_once(f, []() { - dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) { - ELFIO::elfio tmp; - const auto elf = - info->dlpi_addr ? info->dlpi_name : "/proc/self/exe"; - - if (!tmp.load(elf)) return 0; - - auto it = find_section_if(tmp, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - if (!it) return 0; - - const ELFIO::symbol_section_accessor symtab{tmp, it}; - - for (auto i = 0u; i != symtab.get_symbols_num(); ++i) { - auto s = read_symbol(symtab, i); - - if (s.type != STT_OBJECT || s.sect_idx == SHN_UNDEF) continue; - - const auto addr = s.value + info->dlpi_addr; - r.emplace(std::move(s.name), std::make_pair(addr, s.size)); - } - - return 0; - }, nullptr); - }); - - return r; -} - -inline -__attribute__((visibility("hidden"))) -std::unordered_map& globals() { - static std::unordered_map r; - static std::once_flag f; - - std::call_once(f, []() { r.reserve(symbol_addresses().size()); }); - - return r; -} - -inline -std::vector copy_names_of_undefined_symbols( - const ELFIO::symbol_section_accessor& section) { - std::vector r; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - auto tmp = read_symbol(section, i); - if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue; - - r.push_back(std::move(tmp.name)); - } - - return r; -} - [[noreturn]] void hip_throw(const std::exception&); -inline -void associate_code_object_symbols_with_host_allocation( - const ELFIO::elfio& reader, - ELFIO::section* code_object_dynsym, - hsa_agent_t agent, - hsa_executable_t executable) { - if (!code_object_dynsym) return; +class kernargs_size_align; +class program_state_impl; +class program_state { +public: + program_state(); + ~program_state(); - const auto undefined_symbols = copy_names_of_undefined_symbols( - ELFIO::symbol_section_accessor{reader, code_object_dynsym}); + hipFunction_t kernel_descriptor(std::uintptr_t, + hsa_agent_t); + + kernargs_size_align get_kernargs_size_align(std::uintptr_t); + hsa_executable_t load_executable(const char*, const size_t, + hsa_executable_t, + hsa_agent_t); - for (auto&& x : undefined_symbols) { - if (globals().find(x) != globals().cend()) return; + void* global_addr_by_name(const char* name); - const auto it1 = symbol_addresses().find(x); + // to fix later + const std::vector& executables(hsa_agent_t agent); - if (it1 == symbol_addresses().cend()) { - hip_throw(std::runtime_error{ - "Global symbol: " + x + " is undefined."}); - } + program_state(const program_state&) = delete; - static std::mutex mtx; - std::lock_guard lck{mtx}; +private: + program_state_impl* impl; +}; - if (globals().find(x) != globals().cend()) return; - - globals().emplace(x, (void*)(it1->second.first)); - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(it1->second.first), - it1->second.second, - nullptr, // All agents. - 0, - &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - } -} - -inline -void load_code_object_and_freeze_executable( - const std::string& file, hsa_agent_t agent, hsa_executable_t executable) { - // TODO: the following sequence is inefficient, should be refactored - // into a single load of the file and subsequent ELFIO - // processing. - static const auto cor_deleter = [](hsa_code_object_reader_t* p) { - if (!p) return; - - hsa_code_object_reader_destroy(*p); - delete p; - }; - - using RAII_code_reader = - std::unique_ptr; - - if (file.empty()) return; - - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - hsa_code_object_reader_create_from_memory( - file.data(), file.size(), tmp.get()); - - hsa_executable_load_agent_code_object( - executable, agent, *tmp, nullptr, nullptr); - - hsa_executable_freeze(executable, nullptr); - - static std::vector code_readers; - static std::mutex mtx; - - std::lock_guard lck{mtx}; - code_readers.push_back(move(tmp)); -} - -inline -hsa_executable_t load_executable(const std::string& file, - hsa_executable_t executable, - hsa_agent_t agent) { - ELFIO::elfio reader; - std::stringstream tmp{file}; - - if (!reader.load(tmp)) return hsa_executable_t{}; - - const auto code_object_dynsym = find_section_if( - reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_DYNSYM; - }); - - associate_code_object_symbols_with_host_allocation(reader, - code_object_dynsym, - agent, executable); - - load_code_object_and_freeze_executable(file, agent, executable); - - return executable; -} +class kernargs_size_align { +public: + std::size_t size(std::size_t n) const; + std::size_t alignment(std::size_t n) const; +private: + const void* handle; + friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t); +}; inline __attribute__((visibility("hidden"))) -const std::vector& executables(hsa_agent_t agent) { - static std::unordered_map>> r; - static std::once_flag f; - - // Create placeholder for each agent in the map. - std::call_once(f, []() { - for (auto&& x : hip_impl::all_hsa_agents()) { - (void)r[x]; - } - }); - - if (r.find(agent) == r.cend()) { - hip_throw(std::runtime_error{"invalid agent"}); - } - - std::call_once(r[agent].first, [](hsa_agent_t aa) { - hsa_agent_iterate_isas(aa, [](hsa_isa_t x, void* pa) { - const auto it = code_object_blobs().find(x); - - if (it == code_object_blobs().cend()) return HSA_STATUS_SUCCESS; - - hsa_agent_t a = *static_cast(pa); - - for (auto&& blob : it->second) { - hsa_executable_t tmp = {}; - - hsa_executable_create_alt( - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, - &tmp); - - // TODO: this is massively inefficient and only meant for - // illustration. - std::string blob_to_str{blob.cbegin(), blob.cend()}; - tmp = load_executable(blob_to_str, tmp, a); - - if (tmp.handle) r[a].second.push_back(tmp); - } - - return HSA_STATUS_SUCCESS; - }, &aa); - }, agent); - - return r[agent].second; -} - -inline -std::vector> function_names_for( - const ELFIO::elfio& reader, ELFIO::section* symtab) { - std::vector> r; - ELFIO::symbol_section_accessor symbols{reader, symtab}; - - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - auto tmp = read_symbol(symbols, i); - - if (tmp.type != STT_FUNC) continue; - if (tmp.type == SHN_UNDEF) continue; - if (tmp.name.empty()) continue; - - r.emplace_back(tmp.value, tmp.name); - } - - return r; -} - -inline -__attribute__((visibility("hidden"))) -const std::unordered_map& function_names() { - static std::unordered_map r; - static std::once_flag f; - - std::call_once(f, []() { - dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) { - ELFIO::elfio tmp; - const auto elf = - info->dlpi_addr ? info->dlpi_name : "/proc/self/exe"; - - if (!tmp.load(elf)) return 0; - - const auto it = find_section_if(tmp, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - if (!it) return 0; - - auto names = function_names_for(tmp, it); - for (auto&& x : names) x.first += info->dlpi_addr; - - r.insert( - std::make_move_iterator(names.begin()), - std::make_move_iterator(names.end())); - - return 0; - }, nullptr); - }); - - return r; -} - -inline -__attribute__((visibility("hidden"))) -const std::unordered_map< - std::string, std::vector>& kernels(hsa_agent_t agent) { - static std::unordered_map>>> r; - static std::once_flag f; - - // Create placeholder for each agent in the map. - std::call_once(f, []() { - for (auto&& x : hip_impl::all_hsa_agents()) { - (void)r[x]; - } - }); - - if (r.find(agent) == r.cend()) { - hip_throw(std::runtime_error{"invalid agent"}); - } - - std::call_once(r[agent].first, [](hsa_agent_t aa) { - static const auto copy_kernels = []( - hsa_executable_t, hsa_agent_t a, hsa_executable_symbol_t x, void*) { - if (type(x) == HSA_SYMBOL_KIND_KERNEL) r[a].second[name(x)].push_back(x); - - return HSA_STATUS_SUCCESS; - }; - - for (auto&& executable : executables(aa)) { - hsa_executable_iterate_agent_symbols( - executable, aa, copy_kernels, nullptr); - } - }, agent); - - return r[agent].second; -} - -inline -__attribute__((visibility("hidden"))) -const std::unordered_map< - std::uintptr_t, - Kernel_descriptor>& functions(hsa_agent_t agent) { - static std::unordered_map>> r; - static std::once_flag f; - - // Create placeholder for each agent in the map. - std::call_once(f, []() { - for (auto&& x : hip_impl::all_hsa_agents()) { - (void)r[x]; - } - }); - - if (r.find(agent) == r.cend()) { - hip_throw(std::runtime_error{"invalid agent"}); - } - - std::call_once(r[agent].first, [](hsa_agent_t aa) { - for (auto&& function : function_names()) { - const auto it = kernels(aa).find(function.second); - - if (it == kernels(aa).cend()) continue; - - for (auto&& kernel_symbol : it->second) { - r[aa].second.emplace( - function.first, - Kernel_descriptor{kernel_object(kernel_symbol), it->first}); - } - } - }, agent); - - return r[agent].second; -} - -inline -std::size_t parse_args( - const std::string& metadata, - std::size_t f, - std::size_t l, - std::vector>& size_align) { - if (f == l) return f; - if (!size_align.empty()) return l; - - do { - static constexpr size_t size_sz{5}; - f = metadata.find("Size:", f) + size_sz; - - if (l <= f) return f; - - auto size = std::strtoul(&metadata[f], nullptr, 10); - - static constexpr size_t align_sz{6}; - f = metadata.find("Align:", f) + align_sz; - - char* l{}; - auto align = std::strtoul(&metadata[f], &l, 10); - - f += (l - &metadata[f]) + 1; - - size_align.emplace_back(size, align); - } while (true); -} - -inline -void read_kernarg_metadata( - ELFIO::elfio& reader, - std::unordered_map< - std::string, - std::vector>>& kernargs) { - // TODO: this is inefficient. - auto it = find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_NOTE; - }); - - if (!it) return; - - const ELFIO::note_section_accessor acc{reader, it}; - for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) { - ELFIO::Elf_Word type{}; - std::string name{}; - void* desc{}; - ELFIO::Elf_Word desc_size{}; - - acc.get_note(i, type, name, desc, desc_size); - - if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. - - std::string tmp{ - static_cast(desc), static_cast(desc) + desc_size}; - - auto dx = tmp.find("Kernels:"); - - if (dx == std::string::npos) continue; - - static constexpr decltype(tmp.size()) kernels_sz{8}; - dx += kernels_sz; - - do { - dx = tmp.find("Name:", dx); - - if (dx == std::string::npos) break; - - static constexpr decltype(tmp.size()) name_sz{5}; - dx = tmp.find_first_not_of(" '", dx + name_sz); - - auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx); - dx += fn.size(); - - auto dx1 = tmp.find("CodeProps", dx); - dx = tmp.find("Args:", dx); - - if (dx1 < dx) { - dx = dx1; - continue; - } - if (dx == std::string::npos) break; - - static constexpr decltype(tmp.size()) args_sz{5}; - dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]); - } while (true); - } -} - -inline -__attribute__((visibility("hidden"))) -const std::unordered_map< - std::string, std::vector>>& kernargs() { - static std::unordered_map< - std::string, std::vector>> r; - static std::once_flag f; - - std::call_once(f, []() { - for (auto&& isa_blobs : code_object_blobs()) { - for (auto&& blob : isa_blobs.second) { - std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}}; - - ELFIO::elfio reader; - - if (!reader.load(tmp)) continue; - - read_kernarg_metadata(reader, r); - } - } - }); - - return r; +program_state& get_program_state() { + static program_state ps; + return ps; } } // Namespace hip_impl. diff --git a/projects/hip/src/hip_clang.cpp b/projects/hip/src/hip_clang.cpp index 82e181ca69..b8beb05400 100644 --- a/projects/hip/src/hip_clang.cpp +++ b/projects/hip/src/hip_clang.cpp @@ -85,7 +85,9 @@ __hipRegisterFatBinary(const void* data) reinterpret_cast(header) + desc->offset), desc->size}; if (HIP_DUMP_CODE_OBJECT) __hipDumpCodeObject(image); - module->executable = hip_impl::load_executable(image, module->executable, agent); + module->executable = hip_impl::get_program_state().load_executable(image.data(), image.size(), + module->executable, + agent); if (module->executable.handle) { modules->at(deviceId) = module; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 8011160cc1..2af83b65a8 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -2489,4 +2489,14 @@ namespace hip_impl { std::terminate(); #endif } + + std::mutex executables_cache_mutex; + + std::vector& executables_cache( + std::string elf, hsa_isa_t isa, hsa_agent_t agent) { + static std::unordered_map>>> cache; + return cache[elf][isa][agent]; + } } // Namespace hip_impl. diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index e1a3b1ff65..e029ec6e78 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -25,6 +25,7 @@ THE SOFTWARE. #include "hip/hcc_detail/hsa_helpers.hpp" #include "hip/hcc_detail/program_state.hpp" #include "hip_hcc_internal.h" +#include "program_state.inl" #include "trace_helper.h" #include @@ -289,7 +290,7 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, if (!name) return hipErrorNotInitialized; - return hip_impl::read_agent_global_from_module(dptr, bytes, hmod, name); + return hip_impl::get_agent_globals().read_agent_global_from_module(dptr, bytes, hmod, name); } namespace hip_impl { @@ -512,11 +513,8 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) if (!func) return hipErrorInvalidDeviceFunction; auto agent = this_agent(); - const auto it = functions(agent).find(reinterpret_cast(func)); - - if (it == functions(agent).cend()) return hipErrorInvalidDeviceFunction; - - const auto header = static_cast(it->second)->_header; + auto kd = get_program_state().kernel_descriptor(reinterpret_cast(func), agent); + const auto header = kd->_header; if (!header) throw runtime_error{"Ill-formed Kernel_descriptor."}; @@ -548,7 +546,8 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) { auto content = tmp.empty() ? read_elf_file_as_string(image) : tmp; - (*module)->executable = load_executable(content, (*module)->executable, + (*module)->executable = get_program_state().load_executable( + content.data(), content.size(), (*module)->executable, this_agent()); // compute the hash of the code object @@ -591,10 +590,10 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const if (!texRef) return ihipLogStatus(hipErrorInvalidValue); if (!hmod || !name) return ihipLogStatus(hipErrorNotInitialized); + + auto addr = get_program_state().global_addr_by_name(name); + if (addr == nullptr) return ihipLogStatus(hipErrorInvalidValue); - const auto it = globals().find(name); - if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue); - - *texRef = reinterpret_cast(it->second); + *texRef = reinterpret_cast(addr); return ihipLogStatus(hipSuccess); } diff --git a/projects/hip/src/program_state.cpp b/projects/hip/src/program_state.cpp new file mode 100644 index 0000000000..6783f85b9b --- /dev/null +++ b/projects/hip/src/program_state.cpp @@ -0,0 +1,63 @@ +#include "../include/hip/hcc_detail/program_state.hpp" + +#include + +#include +#include +#include +#include + +// contains implementation of program_state_impl +#include "program_state.inl" + +namespace hip_impl { + +std::size_t kernargs_size_align::kernargs_size_align::size(std::size_t n) const{ + return (*reinterpret_cast>*>(handle))[n].first; +} + +std::size_t kernargs_size_align::alignment(std::size_t n) const{ + return (*reinterpret_cast>*>(handle))[n].second; +} + +program_state::program_state() : + impl(new program_state_impl) { + if (!impl) hip_throw(std::runtime_error { + "Unknown error when constructing program state."}); +} + +program_state::~program_state() { + delete(impl); +} + +void* program_state::global_addr_by_name(const char* name) { + const auto it = impl->get_globals().find(name); + if (it == impl->get_globals().end()) + return nullptr; + else + return it->second; +} + +hsa_executable_t program_state::load_executable(const char* data, + const size_t data_size, + hsa_executable_t executable, + hsa_agent_t agent) { + return impl->load_executable(data, data_size, executable, agent); +} + +const std::vector& program_state::executables(hsa_agent_t agent) { + return impl->get_executables(agent); +} + +hipFunction_t program_state::kernel_descriptor(std::uintptr_t function_address, + hsa_agent_t agent) { + auto& kd = impl->kernel_descriptor(function_address, agent); + return kd; +} + +kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t kernel) { + kernargs_size_align t; + t.handle = reinterpret_cast(&impl->kernargs_size_align(kernel)); + return t; +} +}; diff --git a/projects/hip/src/program_state.inl b/projects/hip/src/program_state.inl new file mode 100644 index 0000000000..9729da8115 --- /dev/null +++ b/projects/hip/src/program_state.inl @@ -0,0 +1,713 @@ +#include "../include/hip/hcc_detail/program_state.hpp" + +#include "../include/hip/hcc_detail/code_object_bundle.hpp" +#include "../include/hip/hcc_detail/hsa_helpers.hpp" + +#if !defined(__cpp_exceptions) + #define try if (true) + #define catch(...) if (false) +#endif +#include "../include/hip/hcc_detail/elfio/elfio.hpp" +#if !defined(__cpp_exceptions) + #undef try + #undef catch +#endif + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hip_impl { + +[[noreturn]] +void hip_throw(const std::exception&); + +std::vector all_hsa_agents(); + +extern std::mutex executables_cache_mutex; + +std::vector& executables_cache(std::string, hsa_isa_t, hsa_agent_t); + +template +inline +ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) { + const auto it = std::find_if( + reader.sections.begin(), reader.sections.end(), std::move(p)); + + return it != reader.sections.end() ? *it : nullptr; +} + +struct Symbol { + std::string name; + ELFIO::Elf64_Addr value = 0; + ELFIO::Elf_Xword size = 0; + ELFIO::Elf_Half sect_idx = 0; + std::uint8_t bind = 0; + std::uint8_t type = 0; + std::uint8_t other = 0; +}; + +class Kernel_descriptor { + std::uint64_t kernel_object_{}; + amd_kernel_code_t const* kernel_header_{nullptr}; + std::string name_{}; +public: + Kernel_descriptor() = default; + Kernel_descriptor(std::uint64_t kernel_object, const std::string& name) + : kernel_object_{kernel_object}, name_{name} + { + bool supported{false}; + std::uint16_t min_v{UINT16_MAX}; + auto r = hsa_system_major_extension_supported( + HSA_EXTENSION_AMD_LOADER, 1, &min_v, &supported); + + if (r != HSA_STATUS_SUCCESS || !supported) return; + + hsa_ven_amd_loader_1_01_pfn_t tbl{}; + + r = hsa_system_get_major_extension_table( + HSA_EXTENSION_AMD_LOADER, + 1, + sizeof(tbl), + reinterpret_cast(&tbl)); + + if (r != HSA_STATUS_SUCCESS) return; + if (!tbl.hsa_ven_amd_loader_query_host_address) return; + + r = tbl.hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object_), + reinterpret_cast(&kernel_header_)); + + if (r != HSA_STATUS_SUCCESS) return; + } + Kernel_descriptor(const Kernel_descriptor&) = default; + Kernel_descriptor(Kernel_descriptor&&) = default; + ~Kernel_descriptor() = default; + + Kernel_descriptor& operator=(const Kernel_descriptor&) = default; + Kernel_descriptor& operator=(Kernel_descriptor&&) = default; + + operator hipFunction_t() const { // TODO: this is awful and only meant for illustration. + return reinterpret_cast(const_cast(this)); + } +}; + +class program_state_impl { + +public: + + std::pair< + std::once_flag, + std::unordered_map< + std::string, + std::unordered_map< + hsa_isa_t, + std::vector>>>> code_object_blobs; + + std::pair< + std::once_flag, + std::unordered_map< + std::string, + std::pair>> symbol_addresses; + + std::unordered_map< + hsa_agent_t, + std::pair< + std::once_flag, + std::vector>> executables; + + std::unordered_map< + hsa_agent_t, + std::pair< + std::once_flag, + std::unordered_map< + std::string, + std::vector>>> kernels; + + std::pair< + std::once_flag, + std::unordered_map< + std::string, std::vector>>> kernargs; + + std::pair< + std::once_flag, + std::unordered_map> function_names; + + std::unordered_map< + hsa_agent_t, + std::pair< + std::once_flag, + std::unordered_map< + std::uintptr_t, + Kernel_descriptor>>> functions; + + std::tuple< + std::once_flag, + std::mutex, + std::unordered_map> globals; + + using RAII_code_reader = + std::unique_ptr>; + std::pair< + std::mutex, + std::vector> code_readers; + + program_state_impl() { + // Create placeholder for each agent for the per-agent members. + for (auto&& x : hip_impl::all_hsa_agents()) { + (void)executables[x]; + (void)kernels[x]; + (void)functions[x]; + } + } + + const std::unordered_map< + std::string, + std::unordered_map< + hsa_isa_t, + std::vector>>>& get_code_object_blobs() { + + std::call_once(code_object_blobs.first, [this]() { + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) { + ELFIO::elfio tmp; + + const auto elf = + info->dlpi_addr ? info->dlpi_name : "/proc/self/exe"; + + if (!tmp.load(elf)) return 0; + + const auto it = find_section_if(tmp, [](const ELFIO::section* x) { + return x->get_name() == ".kernel"; + }); + + if (!it) return 0; + + auto& impl = *static_cast(p); + + std::vector multi_arch_blob(it->get_data(), it->get_data() + it->get_size()); + auto blob_it = multi_arch_blob.begin(); + while (blob_it != multi_arch_blob.end()) { + Bundled_code_header tmp{blob_it, multi_arch_blob.end()}; + + if (!valid(tmp)) break; + + for (auto&& bundle : bundles(tmp)) { + impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob); + } + + blob_it += tmp.bundled_code_size; + }; + + return 0; + }, this); + }); + + return code_object_blobs.second; + } + + Symbol read_symbol(const ELFIO::symbol_section_accessor& section, + unsigned int idx) { + assert(idx < section.get_symbols_num()); + + Symbol r; + section.get_symbol( + idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); + + return r; + } + + const std::unordered_map< + std::string, + std::pair>& get_symbol_addresses() { + + std::call_once(symbol_addresses.first, [this]() { + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* psi_ptr) { + + if (!psi_ptr) + return 0; + + program_state_impl* t = static_cast(psi_ptr); + + ELFIO::elfio tmp; + const auto elf = + info->dlpi_addr ? info->dlpi_name : "/proc/self/exe"; + + if (!tmp.load(elf)) return 0; + + auto it = find_section_if(tmp, [](const ELFIO::section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + if (!it) return 0; + + const ELFIO::symbol_section_accessor symtab{tmp, it}; + + for (auto i = 0u; i != symtab.get_symbols_num(); ++i) { + auto s = t->read_symbol(symtab, i); + + if (s.type != STT_OBJECT || s.sect_idx == SHN_UNDEF) continue; + + const auto addr = s.value + info->dlpi_addr; + t->symbol_addresses.second.emplace(std::move(s.name), std::make_pair(addr, s.size)); + } + + return 0; + }, this); + }); + + return symbol_addresses.second; + } + + std::unordered_map& get_globals() { + std::call_once(std::get<0>(globals), [this]() { + std::get<2>(globals).reserve(get_symbol_addresses().size()); + }); + return std::get<2>(globals); + } + + std::mutex& get_globals_mutex() { + return std::get<1>(globals); + } + + std::vector copy_names_of_undefined_symbols( + const ELFIO::symbol_section_accessor& section) { + std::vector r; + + for (auto i = 0u; i != section.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + auto tmp = read_symbol(section, i); + if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue; + + r.push_back(std::move(tmp.name)); + } + + return r; + } + + void associate_code_object_symbols_with_host_allocation( + const ELFIO::elfio& reader, + ELFIO::section* code_object_dynsym, + hsa_agent_t agent, + hsa_executable_t executable) { + if (!code_object_dynsym) return; + + const auto undefined_symbols = copy_names_of_undefined_symbols( + ELFIO::symbol_section_accessor{reader, code_object_dynsym}); + + auto& g = get_globals(); + auto& g_mutex = get_globals_mutex(); + for (auto&& x : undefined_symbols) { + + if (g.find(x) != g.cend()) return; + + const auto it1 = get_symbol_addresses().find(x); + + if (it1 == get_symbol_addresses().cend()) { + hip_throw(std::runtime_error{ + "Global symbol: " + x + " is undefined."}); + } + + std::lock_guard lck{g_mutex}; + + if (g.find(x) != g.cend()) return; + + g.emplace(x, (void*)(it1->second.first)); + void* p = nullptr; + hsa_amd_memory_lock( + reinterpret_cast(it1->second.first), + it1->second.second, + nullptr, // All agents. + 0, + &p); + + hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + } + } + + void load_code_object_and_freeze_executable( + const std::string& file, hsa_agent_t agent, hsa_executable_t executable) { + // TODO: the following sequence is inefficient, should be refactored + // into a single load of the file and subsequent ELFIO + // processing. + if (file.empty()) return; + + static const auto cor_deleter = [] (hsa_code_object_reader_t* p) { + if (!p) return; + hsa_code_object_reader_destroy(*p); + delete p; + }; + + RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; + hsa_code_object_reader_create_from_memory( + file.data(), file.size(), tmp.get()); + + hsa_executable_load_agent_code_object( + executable, agent, *tmp, nullptr, nullptr); + + hsa_executable_freeze(executable, nullptr); + + std::lock_guard lck{code_readers.first}; + code_readers.second.push_back(move(tmp)); + } + + + const std::vector& get_executables(hsa_agent_t agent) { + + if (executables.find(agent) == executables.cend()) { + hip_throw(std::runtime_error{"invalid agent"}); + } + + std::call_once(executables[agent].first, [this](hsa_agent_t aa) { + auto data = std::make_pair(this, &aa); + hsa_agent_iterate_isas(aa, [](hsa_isa_t x, void* d) { + auto& p = *static_cast(d); + auto& impl = *(p.first); + for (const auto code_object_it : impl.get_code_object_blobs()) { + const auto elf = code_object_it.first; + const auto code_object_blobs = code_object_it.second; + const auto it = code_object_blobs.find(x); + + if (it == code_object_blobs.cend()) continue; + + hsa_agent_t a = *static_cast(p.second); + + std::lock_guard lck{executables_cache_mutex}; + + std::vector& current_exes = + hip_impl::executables_cache(elf, x, a); + // check the cache for already loaded executables + if (current_exes.empty()) { + // executables do not yet exist for this elf+isa+agent, create and cache them + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only meant for + // illustration. + tmp = impl.load_executable(blob.data(), blob.size(), tmp, a); + + if (tmp.handle) current_exes.push_back(tmp); + } + } + // append cached executables to our agent's vector of executables + impl.executables[a].second.insert(impl.executables[a].second.end(), + current_exes.begin(), current_exes.end()); + } + return HSA_STATUS_SUCCESS; + }, &data); + }, agent); + + return executables[agent].second; + } + + hsa_executable_t load_executable(const char* data, + const size_t data_size, + hsa_executable_t executable, + hsa_agent_t agent) { + ELFIO::elfio reader; + std::string ts = std::string(data, data_size); + std::stringstream tmp{ts}; + + if (!reader.load(tmp)) return hsa_executable_t{}; + const auto code_object_dynsym = find_section_if( + reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_DYNSYM; + }); + + associate_code_object_symbols_with_host_allocation(reader, + code_object_dynsym, + agent, executable); + + load_code_object_and_freeze_executable(ts, agent, executable); + + return executable; + } + + std::vector> function_names_for( + const ELFIO::elfio& reader, ELFIO::section* symtab) { + std::vector> r; + ELFIO::symbol_section_accessor symbols{reader, symtab}; + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + auto tmp = read_symbol(symbols, i); + + if (tmp.type != STT_FUNC) continue; + if (tmp.type == SHN_UNDEF) continue; + if (tmp.name.empty()) continue; + + r.emplace_back(tmp.value, tmp.name); + } + + return r; + } + + const std::unordered_map& get_function_names() { + + std::call_once(function_names.first, [this]() { + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) { + ELFIO::elfio tmp; + const auto elf = + info->dlpi_addr ? info->dlpi_name : "/proc/self/exe"; + + if (!tmp.load(elf)) return 0; + + const auto it = find_section_if(tmp, [](const ELFIO::section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + if (!it) return 0; + + auto& impl = *static_cast(p); + + auto names = impl.function_names_for(tmp, it); + for (auto&& x : names) x.first += info->dlpi_addr; + + impl.function_names.second.insert( + std::make_move_iterator(names.begin()), + std::make_move_iterator(names.end())); + + return 0; + }, this); + }); + + return function_names.second; + } + + const std::unordered_map< + std::string, std::vector>& get_kernels(hsa_agent_t agent) { + + if (kernels.find(agent) == kernels.cend()) { + hip_throw(std::runtime_error{"invalid agent"}); + } + + std::call_once(kernels[agent].first, [this](hsa_agent_t aa) { + static const auto copy_kernels = []( + hsa_executable_t, hsa_agent_t a, hsa_executable_symbol_t x, void* p) { + auto& impl = *static_cast(p); + if (type(x) == HSA_SYMBOL_KIND_KERNEL) impl.kernels[a].second[hip_impl::name(x)].push_back(x); + + return HSA_STATUS_SUCCESS; + }; + + for (auto&& executable : get_executables(aa)) { + hsa_executable_iterate_agent_symbols( + executable, aa, copy_kernels, this); + } + }, agent); + + return kernels[agent].second; + } + + const std::unordered_map< + std::uintptr_t, + Kernel_descriptor>& get_functions(hsa_agent_t agent) { + + if (functions.find(agent) == functions.cend()) { + hip_throw(std::runtime_error{"invalid agent"}); + } + + std::call_once(functions[agent].first, [this](hsa_agent_t aa) { + for (auto&& function : get_function_names()) { + const auto it = get_kernels(aa).find(function.second); + + if (it == get_kernels(aa).cend()) continue; + + for (auto&& kernel_symbol : it->second) { + functions[aa].second.emplace( + function.first, + Kernel_descriptor{kernel_object(kernel_symbol), it->first}); + } + } + }, agent); + + return functions[agent].second; + } + + std::size_t parse_args( + const std::string& metadata, + std::size_t f, + std::size_t l, + std::vector>& size_align) { + if (f == l) return f; + if (!size_align.empty()) return l; + + do { + static constexpr size_t size_sz{5}; + f = metadata.find("Size:", f) + size_sz; + + if (l <= f) return f; + + auto size = std::strtoul(&metadata[f], nullptr, 10); + + static constexpr size_t align_sz{6}; + f = metadata.find("Align:", f) + align_sz; + + char* l{}; + auto align = std::strtoul(&metadata[f], &l, 10); + + f += (l - &metadata[f]) + 1; + + size_align.emplace_back(size, align); + } while (true); + } + + void read_kernarg_metadata( + ELFIO::elfio& reader, + std::unordered_map< + std::string, + std::vector>>& kernargs) { + // TODO: this is inefficient. + auto it = find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_NOTE; + }); + + if (!it) return; + + const ELFIO::note_section_accessor acc{reader, it}; + for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) { + ELFIO::Elf_Word type{}; + std::string name{}; + void* desc{}; + ELFIO::Elf_Word desc_size{}; + + acc.get_note(i, type, name, desc, desc_size); + + if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. + + std::string tmp{ + static_cast(desc), static_cast(desc) + desc_size}; + + auto dx = tmp.find("Kernels:"); + + if (dx == std::string::npos) continue; + + static constexpr decltype(tmp.size()) kernels_sz{8}; + dx += kernels_sz; + + do { + dx = tmp.find("Name:", dx); + + if (dx == std::string::npos) break; + + static constexpr decltype(tmp.size()) name_sz{5}; + dx = tmp.find_first_not_of(" '", dx + name_sz); + + auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx); + dx += fn.size(); + + auto dx1 = tmp.find("CodeProps", dx); + dx = tmp.find("Args:", dx); + + if (dx1 < dx) { + dx = dx1; + continue; + } + if (dx == std::string::npos) break; + + static constexpr decltype(tmp.size()) args_sz{5}; + dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]); + } while (true); + } + } + + const std::unordered_map>>& get_kernargs() { + + std::call_once(kernargs.first, [this]() { + for (auto&& name_and_isa_blobs : get_code_object_blobs()) { + for (auto&& isa_blobs : name_and_isa_blobs.second) { + for (auto&& blob : isa_blobs.second) { + std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}}; + + ELFIO::elfio reader; + + if (!reader.load(tmp)) continue; + + read_kernarg_metadata(reader, kernargs.second); + } + } + } + }); + + return kernargs.second; + } + + std::string name(std::uintptr_t function_address) + { + const auto it = get_function_names().find(function_address); + + if (it == get_function_names().cend()) { + hip_throw(std::runtime_error{ + "Invalid function passed to hipLaunchKernelGGL."}); + } + + return it->second; + } + + std::string name(hsa_agent_t agent) + { + char n[64]{}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); + + return std::string{n}; + } + + const Kernel_descriptor& kernel_descriptor(std::uintptr_t function_address, + hsa_agent_t agent) { + + auto it0 = get_functions(agent).find(function_address); + + if (it0 == get_functions(agent).cend()) { + hip_throw(std::runtime_error{ + "No device code available for function: " + + std::string(name(function_address)) + + ", for agent: " + name(agent)}); + } + + return it0->second; + } + + const std::vector>& + kernargs_size_align(std::uintptr_t kernel) { + + auto it = get_function_names().find(kernel); + if (it == get_function_names().cend()) { + hip_throw(std::runtime_error{"Undefined __global__ function."}); + } + + auto it1 = get_kernargs().find(it->second); + if (it1 == get_kernargs().end()) { + hip_throw(std::runtime_error{ + "Missing metadata for __global__ function: " + it->second}); + } + + return it1->second; + } +}; // class program_state_impl + +};