From 299b9d13821240eb724dce4cd104763cb5f229f4 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 28 Oct 2018 17:01:00 +0000 Subject: [PATCH] Rely on code object metadat for kernarg arguments alignof and sizeof. --- .../hip/hcc_detail/functional_grid_launch.hpp | 37 +++++-- .../include/hip/hcc_detail/program_state.hpp | 2 + hipamd/src/program_state.cpp | 102 +++++++++++++++++- 3 files changed, 132 insertions(+), 9 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp b/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp index 66e5873f3a..3a19965974 100644 --- a/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp @@ -33,6 +33,7 @@ THE SOFTWARE. #include #include +#include #include #include #include @@ -56,7 +57,9 @@ template < typename... Ts, typename std::enable_if::type* = nullptr> inline std::vector make_kernarg( - std::vector kernarg, const std::tuple&) { + const std::tuple&, + const std::vector>&, + std::vector kernarg) { return kernarg; } @@ -65,7 +68,9 @@ template < typename... Ts, typename std::enable_if::type* = nullptr> inline std::vector make_kernarg( - std::vector kernarg, const std::tuple& formals) { + const std::tuple& formals, + const std::vector>& size_align, + std::vector kernarg) { using T = typename std::tuple_element>::type; static_assert( @@ -80,24 +85,42 @@ inline std::vector make_kernarg( #endif kernarg.resize(round_up_to_next_multiple_nonnegative( - kernarg.size(), alignof(T)) + sizeof(T)); + kernarg.size(), size_align[n].second) + + size_align[n].first); - new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::get(formals)}; + std::memcpy( + kernarg.data() + kernarg.size() - size_align[n].first, + &std::get(formals), + size_align[n].first); - return make_kernarg(std::move(kernarg), formals); + return make_kernarg(formals, size_align, std::move(kernarg)); } template inline std::vector make_kernarg( - void (*)(Formals...), std::tuple actuals) { + void (*kernel)(Formals...), std::tuple actuals) { static_assert(sizeof...(Formals) == sizeof...(Actuals), "The count of formal arguments must match the count of actuals."); + const auto it = function_names().find( + reinterpret_cast(kernel)); + + if (it == function_names().cend()) { + throw std::runtime_error{"Undefined __global__ function."}; + } + + const auto it1 = kernargs().find(it->second); + + if (it1 == kernargs().end()) { + throw std::runtime_error{ + "Missing metadata for __global__ function: " + it->second}; + } + std::tuple to_formals{std::move(actuals)}; std::vector kernarg; kernarg.reserve(sizeof(to_formals)); - return make_kernarg<0>(std::move(kernarg), to_formals); + return make_kernarg<0>(to_formals, it1->second, std::move(kernarg)); } void hipLaunchKernelGGLImpl(std::uintptr_t function_address, const dim3& numBlocks, diff --git a/hipamd/include/hip/hcc_detail/program_state.hpp b/hipamd/include/hip/hcc_detail/program_state.hpp index bdb87b3509..92bef22172 100644 --- a/hipamd/include/hip/hcc_detail/program_state.hpp +++ b/hipamd/include/hip/hcc_detail/program_state.hpp @@ -99,6 +99,8 @@ const std::unordered_map& function_names(bool rebuild = false); std::unordered_map& globals(bool rebuild = false); +std::unordered_map< + std::string, std::vector>>& kernargs(); hsa_executable_t load_executable(const std::string& file, hsa_executable_t executable, hsa_agent_t agent); diff --git a/hipamd/src/program_state.cpp b/hipamd/src/program_state.cpp index 8766134582..43ceedee7b 100644 --- a/hipamd/src/program_state.cpp +++ b/hipamd/src/program_state.cpp @@ -306,8 +306,8 @@ const unordered_map>& kernels(bool rebui void load_code_object_and_freeze_executable( const string& file, hsa_agent_t agent, - hsa_executable_t - executable) { // TODO: the following sequence is inefficient, should be refactored + hsa_executable_t executable) { + // TODO: the following sequence is inefficient, should be refactored // into a single load of the file and subsequent ELFIO // processing. static const auto cor_deleter = [](hsa_code_object_reader_t* p) { @@ -334,6 +334,85 @@ void load_code_object_and_freeze_executable( code_readers.push_back(move(tmp)); } } + +size_t parse_args( + const string& metadata, + size_t f, + size_t l, + vector>& size_align) { + if (f == l) return f; + + do { + static constexpr size_t size_sz{5}; + f = metadata.find("Size:", f) + size_sz; + + if (l <= f) return f; + + auto size = strtoul(&metadata[f], nullptr, 10); + + static constexpr size_t align_sz{6}; + f = metadata.find("Align:", f) + align_sz; + + char* l{}; + auto align = strtoul(&metadata[f], &l, 10); + + f += (l - &metadata[f]) + 1; + + size_align.emplace_back(size, align); + } while (true); +} + +void read_kernarg_metadata( + elfio& reader, + unordered_map>>& kernargs) +{ // TODO: this is inefficient. + auto it = find_section_if( + reader, [](const section* x) { return x->get_type() == SHT_NOTE; }); + + if (!it) return; + + const note_section_accessor acc{reader, it}; + for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) { + ELFIO::Elf_Word type{}; + string name{}; + void* desc{}; + Elf_Word desc_size{}; + + acc.get_note(i, type, name, desc, desc_size); + + if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. + + string tmp{ + static_cast(desc), static_cast(desc) + desc_size}; + + auto dx = tmp.find("Kernels:"); + + if (dx == string::npos) continue; + + static constexpr decltype(tmp.size()) kernels_sz{8}; + dx += kernels_sz; + + do { + dx = tmp.find("Name:", dx); + + if (dx == string::npos) break; + + static constexpr decltype(tmp.size()) name_sz{5}; + dx = tmp.find_first_not_of(' ', dx + name_sz); + + auto fn = tmp.substr(dx, tmp.find('\n', dx) - dx); + dx += fn.size(); + + dx = tmp.find("Args:", dx); + + if (dx == string::npos) break; + + static constexpr decltype(tmp.size()) args_sz{5}; + dx = parse_args( + tmp, dx + args_sz, tmp.find("CodeProps", dx), kernargs[fn]); + } while (true); + } +} } // namespace namespace hip_impl { @@ -495,6 +574,25 @@ unordered_map& globals(bool rebuild) { return r; } +unordered_map>>& kernargs() { + static unordered_map>> r; + static once_flag f; + + call_once(f, []() { + for (auto&& blob : code_object_blobs()) { + stringstream tmp{std::string{ + blob.second.front().cbegin(), blob.second.front().cend()}}; + + elfio reader; + if (!reader.load(tmp)) continue; + + read_kernarg_metadata(reader, r); + } + }); + + return r; +} + hsa_executable_t load_executable(const string& file, hsa_executable_t executable, hsa_agent_t agent) { elfio reader;