diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp index 03701725eb..0e21b12f5f 100644 --- a/include/hip/hcc_detail/program_state.hpp +++ b/include/hip/hcc_detail/program_state.hpp @@ -35,6 +35,24 @@ THE SOFTWARE. struct ihipModuleSymbol_t; using hipFunction_t = ihipModuleSymbol_t*; +namespace std +{ + template<> + struct hash { + size_t operator()(hsa_agent_t x) const + { + return hash{}(x.handle); + } + }; +} + +inline +constexpr +bool operator==(hsa_agent_t x, hsa_agent_t y) +{ + return x.handle == y.handle; +} + namespace hip_impl { struct Kernel_descriptor { @@ -50,6 +68,8 @@ namespace hip_impl } }; + const std::unordered_map< + hsa_agent_t, std::vector>& executables(); const std::unordered_map< std::uintptr_t, std::vector>>& functions(); diff --git a/src/device_util.cpp b/src/device_util.cpp index 367a4c1a4f..b6aebdfce0 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr) // loop unrolling __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) { - uint8_t *dstPtr, *srcPtr; - dstPtr = (uint8_t*)dst; - srcPtr = (uint8_t*)src; - for(uint32_t i=0;i(dst); + auto srcPtr = static_cast(src); + + while (size >= 4u) { + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + + size -= 4u; + srcPtr += 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = srcPtr[2]; + case 2: dstPtr[1] = srcPtr[1]; + case 1: dstPtr[0] = srcPtr[0]; + } + + return dst; } -__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size) +__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) { - uint8_t *dstPtr; - dstPtr = (uint8_t*)ptr; - for(uint32_t i=0;i(dst); + + while (size >= 4u) { + dstPtr[0] = val; + dstPtr[1] = val; + dstPtr[2] = val; + dstPtr[3] = val; + + size -= 4u; + dstPtr += 4u; } - return nullptr; + switch (size) { + case 3: dstPtr[2] = val; + case 2: dstPtr[1] = val; + case 1: dstPtr[0] = val; + } + + return dst; } __device__ float __hip_erfinvf(float x){ diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 047cf76c08..04ea38fcd5 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -715,7 +715,10 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -750,7 +753,10 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -787,7 +793,10 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; - void *dst = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t dst = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &dst, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst); if(dst == nullptr) @@ -825,7 +834,10 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hc::accelerator acc = ctx->getDevice()->_acc; - void *src = acc.get_symbol_address((const char*) symbolName); + hipDeviceptr_t src = nullptr; + size_t byte_cnt = 0u; + auto status = hipModuleGetGlobal( + &src, &byte_cnt, 0, static_cast(symbolName)); tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src); if(src == nullptr || dst == nullptr) @@ -1171,9 +1183,9 @@ namespace __global__ void hip_fill_n(RandomAccessIterator f, N n, T value) { - const uint32_t grid_dim = hipGridDim_x; + const uint32_t grid_dim = gridDim.x * blockDim.x; - size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x; + size_t idx = blockIdx.x * block_dim + threadIdx.x; while (idx < n) { new (&f[idx]) T{value}; idx += grid_dim; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index df847f9f64..fb1cf29df8 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -554,16 +554,93 @@ namespace } inline - std::vector read_agent_globals(hipModule_t hmodule) + std::vector read_agent_globals( + hsa_agent_t agent, hsa_executable_t executable) { std::vector r; - hsa_executable_iterate_agent_symbols( - hmodule->executable, this_agent(), copy_agent_global_variables, &r); + executable, agent, copy_agent_global_variables, &r); return r; } + + template + std::pair read_global_description( + ForwardIterator f, ForwardIterator l, const char* name) + { + const auto it = std::find_if( + f, l, [=](const Agent_global& x) { return x.name == name; }); + + return it == l ? + std::make_pair(nullptr, 0u) : + std::make_pair(it->address, it->byte_cnt); + } + + hipError_t read_agent_global_from_module( + hipDeviceptr_t *dptr, + size_t* bytes, + hipModule_t hmod, + const char* name) + { + static std::unordered_map< + hipModule_t, std::vector> agent_globals; + + // TODO: this is not particularly robust. + if (agent_globals.count(hmod) == 0) { + static std::mutex mtx; + std::lock_guard lck{mtx}; + + if (agent_globals.count(hmod) == 0) { + agent_globals.emplace( + hmod, read_agent_globals(this_agent(), hmod->executable)); + } + } + + // TODO: This is unsafe iff some other emplacement triggers rehashing. + // It will have to be properly fleshed out in the future. + const auto it0 = agent_globals.find(hmod); + if (it0 == agent_globals.cend()) { + throw std::runtime_error{"agent_globals data structure corrupted."}; + } + + std::tie(*dptr, *bytes) = read_global_description( + it0->second.cbegin(), it0->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } + + hipError_t read_agent_global_from_process( + hipDeviceptr_t *dptr, size_t* bytes, const char* name) + { + static std::unordered_map< + hsa_agent_t, std::vector> agent_globals; + static std::once_flag f; + + std::call_once(f, []() { + for (auto&& agent_executables : hip_impl::executables()) { + std::vector tmp0; + for (auto&& executable : agent_executables.second) { + auto tmp1 = read_agent_globals( + agent_executables.first, executable); + tmp0.insert( + tmp0.end(), + std::make_move_iterator(tmp1.begin()), + std::make_move_iterator(tmp1.end())); + } + agent_globals.emplace(agent_executables.first, std::move(tmp0)); + } + }); + + const auto it = agent_globals.find(this_agent()); + + if (it == agent_globals.cend()) return hipErrorNotInitialized; + + std::tie(*dptr, *bytes) = read_global_description( + it->second.cbegin(), it->second.cend(), name); + + return dptr ? hipSuccess : hipErrorNotFound; + } } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, @@ -574,41 +651,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, if(dptr == NULL || bytes == NULL){ return ihipLogStatus(hipErrorInvalidValue); } - if(name == NULL || hmod == NULL){ + if(name == NULL){ return ihipLogStatus(hipErrorNotInitialized); } else{ - static std::unordered_map< - hipModule_t, std::vector> agent_globals; + ret = hmod ? + read_agent_global_from_module(dptr, bytes, hmod, name) : + read_agent_global_from_process(dptr, bytes, name); - // TODO: this is not particularly robust. - if (agent_globals.count(hmod) == 0) { - static std::mutex mtx; - std::lock_guard lck{mtx}; - - if (agent_globals.count(hmod) == 0) { - agent_globals.emplace(hmod, read_agent_globals(hmod)); - } - } - - // TODO: This is unsafe iff some other emplacement triggers rehashing. - // It will have to be properly fleshed out in the future. - const auto it0 = agent_globals.find(hmod); - if (it0 == agent_globals.cend()) { - throw std::runtime_error{"agent_globals data structure corrupted."}; - } - - const auto it1 = std::find_if( - it0->second.cbegin(), - it0->second.cend(), - [=](const Agent_global& x) { return x.name == name; }); - - if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound); - - *dptr = it1->address; - *bytes = it1->byte_cnt; - - return ihipLogStatus(hipSuccess); + return ihipLogStatus(ret); } } diff --git a/src/program_state.cpp b/src/program_state.cpp index d5e2f80a05..2bb115981b 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -31,14 +31,6 @@ using namespace std; namespace std { - template<> - struct hash { - size_t operator()(hsa_agent_t x) const - { - return hash{}(x.handle); - } - }; - template<> struct hash { size_t operator()(hsa_isa_t x) const @@ -48,13 +40,6 @@ namespace std }; } -inline -constexpr -bool operator==(hsa_agent_t x, hsa_agent_t y) -{ - return x.handle == y.handle; -} - inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) @@ -242,52 +227,6 @@ namespace return r; } - const unordered_map>& executables() - { - static unordered_map> r; - static once_flag f; - - call_once(f, []() { - static const auto accelerators = hc::accelerator::get_all(); - - for (auto&& acc : accelerators) { - auto agent = static_cast(acc.get_hsa_agent()); - - if (!agent) continue; - - hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { - const auto it = code_object_blobs().find(x); - - if (it != code_object_blobs().cend()) { - hsa_agent_t a = *static_cast(pa); - - for (auto&& blob : it->second) { - hsa_executable_t tmp = {}; - - hsa_executable_create_alt( - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, - &tmp); - - // TODO: this is massively inefficient and only - // meant for illustration. - string blob_to_str{blob.cbegin(), blob.cend()}; - stringstream istr{blob_to_str}; - tmp = load_executable(tmp, a, istr); - - if (tmp.handle) r[a].push_back(tmp); - } - } - - return HSA_STATUS_SUCCESS; - }, agent); - } - }); - - return r; - } - vector> function_names_for( const elfio& reader, section* symtab) { @@ -467,6 +406,52 @@ namespace namespace hip_impl { + const unordered_map>& executables() + { + static unordered_map> r; + static once_flag f; + + call_once(f, []() { + static const auto accelerators = hc::accelerator::get_all(); + + for (auto&& acc : accelerators) { + auto agent = static_cast(acc.get_hsa_agent()); + + if (!agent) continue; + + hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) { + const auto it = code_object_blobs().find(x); + + if (it != code_object_blobs().cend()) { + hsa_agent_t a = *static_cast(pa); + + for (auto&& blob : it->second) { + hsa_executable_t tmp = {}; + + hsa_executable_create_alt( + HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &tmp); + + // TODO: this is massively inefficient and only + // meant for illustration. + string blob_to_str{blob.cbegin(), blob.cend()}; + stringstream istr{blob_to_str}; + tmp = load_executable(tmp, a, istr); + + if (tmp.handle) r[a].push_back(tmp); + } + } + + return HSA_STATUS_SUCCESS; + }, agent); + } + }); + + return r; + } + const unordered_map& function_names() { static unordered_map r{