diff --git a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp index e678f25aa2..2fbda48629 100644 --- a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp @@ -33,6 +33,7 @@ THE SOFTWARE. #include #include +#include #include #include #include @@ -56,7 +57,9 @@ template < typename... Ts, typename std::enable_if::type* = nullptr> inline std::vector make_kernarg( - std::vector kernarg, const std::tuple&) { + const std::tuple&, + const std::vector>&, + std::vector kernarg) { return kernarg; } @@ -65,7 +68,9 @@ template < typename... Ts, typename std::enable_if::type* = nullptr> inline std::vector make_kernarg( - std::vector kernarg, const std::tuple& formals) { + const std::tuple& formals, + const std::vector>& size_align, + std::vector kernarg) { using T = typename std::tuple_element>::type; static_assert( @@ -80,24 +85,48 @@ inline std::vector make_kernarg( #endif kernarg.resize(round_up_to_next_multiple_nonnegative( - kernarg.size(), alignof(T)) + sizeof(T)); + kernarg.size(), size_align[n].second) + size_align[n].first); - new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::get(formals)}; + std::memcpy( + kernarg.data() + kernarg.size() - size_align[n].first, + &std::get(formals), + size_align[n].first); - return make_kernarg(std::move(kernarg), formals); + return make_kernarg(formals, size_align, std::move(kernarg)); } template inline std::vector make_kernarg( - void (*)(Formals...), std::tuple actuals) { + void (*kernel)(Formals...), std::tuple actuals) { static_assert(sizeof...(Formals) == sizeof...(Actuals), "The count of formal arguments must match the count of actuals."); + if (sizeof...(Formals) == 0) return {}; + + auto it = function_names().find(reinterpret_cast(kernel)); + if (it == function_names().cend()) { + it = + function_names(true).find(reinterpret_cast(kernel)); + if (it == function_names().cend()) { + throw std::runtime_error{"Undefined __global__ function."}; + } + } + + auto it1 = kernargs().find(it->second); + if (it1 == kernargs().end()) { + it1 = kernargs(true).find(it->second); + + if (it1 == kernargs().end()) { + throw std::runtime_error{ + "Missing metadata for __global__ function: " + it->second}; + } + } + std::tuple to_formals{std::move(actuals)}; std::vector kernarg; kernarg.reserve(sizeof(to_formals)); - return make_kernarg<0>(std::move(kernarg), to_formals); + return make_kernarg<0>(to_formals, it1->second, std::move(kernarg)); } void hipLaunchKernelGGLImpl(std::uintptr_t function_address, const dim3& numBlocks, diff --git a/projects/hip/include/hip/hcc_detail/program_state.hpp b/projects/hip/include/hip/hcc_detail/program_state.hpp index bdb87b3509..da13c7c3db 100644 --- a/projects/hip/include/hip/hcc_detail/program_state.hpp +++ b/projects/hip/include/hip/hcc_detail/program_state.hpp @@ -99,6 +99,9 @@ const std::unordered_map& function_names(bool rebuild = false); std::unordered_map& globals(bool rebuild = false); +const std::unordered_map< + std::string, std::vector>>& + kernargs(bool rebuild = false); hsa_executable_t load_executable(const std::string& file, hsa_executable_t executable, hsa_agent_t agent); diff --git a/projects/hip/src/program_state.cpp b/projects/hip/src/program_state.cpp index 88cdeeb404..bb906b0ad9 100644 --- a/projects/hip/src/program_state.cpp +++ b/projects/hip/src/program_state.cpp @@ -340,6 +340,90 @@ void load_code_object_and_freeze_executable( code_readers.push_back(move(tmp)); } } + +size_t parse_args( + const string& metadata, + size_t f, + size_t l, + vector>& size_align) { + if (f == l) return f; + if (!size_align.empty()) return l; + + do { + static constexpr size_t size_sz{5}; + f = metadata.find("Size:", f) + size_sz; + + if (l <= f) return f; + + auto size = strtoul(&metadata[f], nullptr, 10); + + static constexpr size_t align_sz{6}; + f = metadata.find("Align:", f) + align_sz; + + char* l{}; + auto align = strtoul(&metadata[f], &l, 10); + + f += (l - &metadata[f]) + 1; + + size_align.emplace_back(size, align); + } while (true); +} + +void read_kernarg_metadata( + elfio& reader, + unordered_map>>& kernargs) +{ // TODO: this is inefficient. + auto it = find_section_if( + reader, [](const section* x) { return x->get_type() == SHT_NOTE; }); + + if (!it) return; + + const note_section_accessor acc{reader, it}; + for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) { + ELFIO::Elf_Word type{}; + string name{}; + void* desc{}; + Elf_Word desc_size{}; + + acc.get_note(i, type, name, desc, desc_size); + + if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. + + string tmp{ + static_cast(desc), static_cast(desc) + desc_size}; + + auto dx = tmp.find("Kernels:"); + + if (dx == string::npos) continue; + + static constexpr decltype(tmp.size()) kernels_sz{8}; + dx += kernels_sz; + + do { + dx = tmp.find("Name:", dx); + + if (dx == string::npos) break; + + static constexpr decltype(tmp.size()) name_sz{5}; + dx = tmp.find_first_not_of(" '", dx + name_sz); + + auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx); + dx += fn.size(); + + auto dx1 = tmp.find("CodeProps", dx); + dx = tmp.find("Args:", dx); + + if (dx1 < dx) { + dx = dx1; + continue; + } + if (dx == string::npos) break; + + static constexpr decltype(tmp.size()) args_sz{5}; + dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]); + } while (true); + } +} } // namespace namespace hip_impl { @@ -454,6 +538,7 @@ const unordered_map>>& fu // created previously function_names(rebuild); + kernargs(rebuild); kernels(rebuild); globals(rebuild); } @@ -501,6 +586,45 @@ unordered_map& globals(bool rebuild) { return r; } +const unordered_map>>& kernargs( + bool rebuild) { + static unordered_map>> r; + static once_flag f; + + static const auto build_map = [](decltype(r)& x) { + for (auto&& isa_blobs : code_object_blobs()) { + for (auto&& blob : isa_blobs.second) { + stringstream tmp{std::string{blob.cbegin(), blob.cend()}}; + + elfio reader; + if (!reader.load(tmp)) continue; + + read_kernarg_metadata(reader, x); + } + } + }; + call_once(f, []() { r.reserve(function_names().size()); build_map(r); }); + + if (rebuild) { + static mutex mtx; + thread_local static decltype(r) tmp; + + { + lock_guard lck{mtx}; + + tmp.insert(r.cbegin(), r.cend()); // Should use merge in C++17. + } + + build_map(tmp); + + lock_guard lck{mtx}; + + r.insert(tmp.cbegin(), tmp.cend()); + } + + return r; +} + hsa_executable_t load_executable(const string& file, hsa_executable_t executable, hsa_agent_t agent) { elfio reader;