From 9d088d22836e663df60ce66e08d3fdd28cf636fc Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 21 Nov 2017 02:40:34 +0000 Subject: [PATCH] Refactor the __device__ versions of memset and memcpy to be less awkward i.e. not return nullptr as opposed to the destination pointer (it can only be assumed it was done for maximum confusion) and actually unroll as they claim to. Change all of the {to, from}Symbol functions to use hipModuleGetGlobal, as opposed to hc::accelerator::get_symbol_address which is no longer valid with module based dispatch. --- include/hip/hcc_detail/program_state.hpp | 20 ++++ src/device_util.cpp | 49 +++++++--- src/hip_memory.cpp | 24 +++-- src/hip_module.cpp | 119 ++++++++++++++++------- src/program_state.cpp | 107 +++++++++----------- 5 files changed, 206 insertions(+), 113 deletions(-) diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp index 03701725eb..0e21b12f5f 100644 --- a/include/hip/hcc_detail/program_state.hpp +++ b/include/hip/hcc_detail/program_state.hpp @@ -35,6 +35,24 @@ THE SOFTWARE. struct ihipModuleSymbol_t; using hipFunction_t = ihipModuleSymbol_t*; +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + namespace hip_impl { struct Kernel_descriptor { @@ -50,6 +68,8 @@ namespace hip_impl } }; + const std::unordered_map< + hsa_agent_t, std::vector>& executables(); const std::unordered_map< std::uintptr_t, std::vector>>& functions(); diff --git a/src/device_util.cpp b/src/device_util.cpp index 367a4c1a4f..b6aebdfce0 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr) // loop unrolling __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) { - uint8_t *dstPtr, *srcPtr; - dstPtr = (uint8_t*)dst; - srcPtr = (uint8_t*)src; - for(uint32_t i=0;i(dst); + auto srcPtr = static_cast(src); + + while (size >= 4u) { + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + + size -= 4u; + srcPtr += 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = srcPtr[2]; + case 2: dstPtr[1] = srcPtr[1]; + case 1: dstPtr[0] = srcPtr[0]; + } + + return dst; } -__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size) +__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) { - uint8_t *dstPtr; - dstPtr = (uint8_t*)ptr; - for(uint32_t i=0;i(dst); + + while (size >= 4u) { + dstPtr[0] = val; + dstPtr[1] = val; + dstPtr[2] = val; + dstPtr[3] = val; + + size -= 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = val; + case 2: dstPtr[1] = val; + case 1: dstPtr[0] = val; + } + + return dst; } __device__ float __hip_erfinvf(float x){ diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 047cf76c08..04ea38fcd5 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -715,7 +715,10 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -750,7 +753,10 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -787,7 +793,10 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -825,7 +834,10 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src); if(src == nullptr || dst == nullptr) @@ -1171,9 +1183,9 @@ namespace __global__ void hip_fill_n(RandomAccessIterator f, N n, T value) { - const uint32_t grid_dim = hipGridDim_x; + const uint32_t grid_dim = gridDim.x * blockDim.x; - size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x; + size_t idx = blockIdx.x * block_dim + threadIdx.x; while (idx < n) { new (&f[idx]) T{value}; idx += grid_dim; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index df847f9f64..fb1cf29df8 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -554,16 +554,93 @@ namespace } inline - std::vector read_agent_globals(hipModule_t hmodule) + std::vector read_agent_globals( + hsa_agent_t agent, hsa_executable_t executable) { std::vector r; - hsa_executable_iterate_agent_symbols( - hmodule->executable, this_agent(), copy_agent_global_variables, &r); + executable, agent, copy_agent_global_variables, &r); return r; } + + template + std::pair read_global_description( + ForwardIterator f, ForwardIterator l, const char* name) + { + const auto it = std::find_if( + f, l, [=](const Agent_global& x) { return x.name == name; }); + + return it == l ? + std::make_pair(nullptr, 0u) : + std::make_pair(it->address, it->byte_cnt); + } + + hipError_t read_agent_global_from_module( + hipDeviceptr_t *dptr, + size_t* bytes, + hipModule_t hmod, + const char* name) + { + static std::unordered_map< + hipModule_t, std::vector> agent_globals; + + // TODO: this is not particularly robust. + if (agent_globals.count(hmod) == 0) { + static std::mutex mtx; + std::lock_guard lck{mtx}; + + if (agent_globals.count(hmod) == 0) { + agent_globals.emplace( + hmod, read_agent_globals(this_agent(), hmod->executable)); + } + } + + // TODO: This is unsafe iff some other emplacement triggers rehashing. + // It will have to be properly fleshed out in the future. + const auto it0 = agent_globals.find(hmod); + if (it0 == agent_globals.cend()) { + throw std::runtime_error{"agent_globals data structure corrupted."}; + } + + std::tie(*dptr, *bytes) = read_global_description( + it0->second.cbegin(), it0->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } + + hipError_t read_agent_global_from_process( + hipDeviceptr_t *dptr, size_t* bytes, const char* name) + { + static std::unordered_map< + hsa_agent_t, std::vector> agent_globals; + static std::once_flag f; + + std::call_once(f, []() { + for (auto&& agent_executables : hip_impl::executables()) { + std::vector tmp0; + for (auto&& executable : agent_executables.second) { + auto tmp1 = read_agent_globals( + agent_executables.first, executable); + tmp0.insert( + tmp0.end(), + std::make_move_iterator(tmp1.begin()), + std::make_move_iterator(tmp1.end())); + } + agent_globals.emplace(agent_executables.first, std::move(tmp0)); + } + }); + + const auto it = agent_globals.find(this_agent()); + + if (it == agent_globals.cend()) return hipErrorNotInitialized; + + std::tie(*dptr, *bytes) = read_global_description( + it->second.cbegin(), it->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, @@ -574,41 +651,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, if(dptr == NULL || bytes == NULL){ return ihipLogStatus(hipErrorInvalidValue); } - if(name == NULL || hmod == NULL){ + if(name == NULL){ return ihipLogStatus(hipErrorNotInitialized); } else{ - static std::unordered_map< - hipModule_t, std::vector> agent_globals; + ret = hmod ? + read_agent_global_from_module(dptr, bytes, hmod, name) : + read_agent_global_from_process(dptr, bytes, name); - // TODO: this is not particularly robust. - if (agent_globals.count(hmod) == 0) { - static std::mutex mtx; - std::lock_guard lck{mtx}; - - if (agent_globals.count(hmod) == 0) { - agent_globals.emplace(hmod, read_agent_globals(hmod)); - } - } - - // TODO: This is unsafe iff some other emplacement triggers rehashing. - // It will have to be properly fleshed out in the future. - const auto it0 = agent_globals.find(hmod); - if (it0 == agent_globals.cend()) { - throw std::runtime_error{"agent_globals data structure corrupted."}; - } - - const auto it1 = std::find_if( - it0->second.cbegin(), - it0->second.cend(), - [=](const Agent_global& x) { return x.name == name; }); - - if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound); - - *dptr = it1->address; - *bytes = it1->byte_cnt; - - return ihipLogStatus(hipSuccess); + return ihipLogStatus(ret); } } diff --git a/src/program_state.cpp b/src/program_state.cpp index d5e2f80a05..2bb115981b 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -31,14 +31,6 @@ using namespace std; namespace std { - template<> - struct hash { - size_t operator()(hsa_agent_t x) const - { - return hash{}(x.handle); - } - }; - template<> struct hash { size_t operator()(hsa_isa_t x) const @@ -48,13 +40,6 @@ namespace std }; } -inline -constexpr -bool operator==(hsa_agent_t x, hsa_agent_t y) -{ - return x.handle == y.handle; -} - inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) @@ -242,52 +227,6 @@ namespace return r; } - const unordered_map>& executables() - { - static unordered_map> r; - static once_flag f; - - call_once(f, []() { - static const auto accelerators = hc::accelerator::get_all(); - - for (auto&& acc : accelerators) { - auto agent = static_cast(acc.get_hsa_agent()); - - if (!agent) continue; - - hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { - const auto it = code_object_blobs().find(x); - - if (it != code_object_blobs().cend()) { - hsa_agent_t a = *static_cast(pa); - - for (auto&& blob : it->second) { - hsa_executable_t tmp = {}; - - hsa_executable_create_alt( - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, - &tmp); - - // TODO: this is massively inefficient and only - // meant for illustration. - string blob_to_str{blob.cbegin(), blob.cend()}; - stringstream istr{blob_to_str}; - tmp = load_executable(tmp, a, istr); - - if (tmp.handle) r[a].push_back(tmp); - } - } - - return HSA_STATUS_SUCCESS; - }, agent); - } - }); - - return r; - } - vector> function_names_for( const elfio& reader, section* symtab) { @@ -467,6 +406,52 @@ namespace namespace hip_impl { + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + return r; + } + const unordered_map& function_names() { static unordered_map r{