From 2cacda91bb2d1a6c868d57d24364106c2642143e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 1 Nov 2017 22:33:13 +0000 Subject: [PATCH] Correctly deal with functions from shared objects, wherein the program visible VA == so_base_va + st_value(function_symbol). Remove quaint usage of pfe for hipMemset (which is actually fill_n). --- src/hip_memory.cpp | 133 +++++++++++++++++++----------------------- src/program_state.cpp | 108 ++++++++++++++++++++++++---------- 2 files changed, 136 insertions(+), 105 deletions(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 96fc25c27d..32e0016178 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1153,42 +1153,56 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) return ihipLogStatus(e); } -// TODO - make member function of stream? +namespace +{ + template< + uint32_t block_dim, + typename RandomAccessIterator, + typename N, + typename T> + __global__ + void hip_fill_n(RandomAccessIterator f, N n, T value) + { + const uint32_t grid_dim = hipGridDim_x; + + size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x; + while (idx < n) { + new (&f[idx]) T{value}; + idx += grid_dim; + } + } + + template< + typename T, + typename std::enable_if{}>::type* = nullptr> + inline + const T& clamp_integer(const T& x, const T& lower, const T& upper) + { + assert(!(upper < lower)); + + return std::min(upper, std::max(x, lower)); + } +} + template void ihipMemsetKernel(hipStream_t stream, - LockedAccessor_StreamCrit_t &crit, - T * ptr, T val, size_t sizeBytes, - hc::completion_future *cf) + T * ptr, T val, size_t sizeBytes) { - int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits); - const int threads_per_wg = 256; + static constexpr uint32_t block_dim = 256; - int threads = wg * threads_per_wg; - if (threads > sizeBytes) { - threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg; - } - - - hc::extent<1> ext(threads); - auto ext_tile = ext.tile(threads_per_wg); - - *cf = - hc::parallel_for_each( - crit->_av, - ext_tile, - [=] (hc::tiled_index<1> idx) - __attribute__((hc)) - { - int offset = amp_get_global_id(0); - // TODO-HCC - change to hc_get_local_size() - int stride = amp_get_local_size(0) * hc_get_num_groups(0) ; - - for (int i=offset; i( + sizeBytes / block_dim, 1, UINT32_MAX); + hipLaunchKernelGGL( + hip_fill_n, + dim3(grid_dim), + dim3{block_dim}, + 0u, + stream, + ptr, + sizeBytes, + std::move(val)); } @@ -1202,17 +1216,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1220,19 +1229,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av); - - if (HIP_API_BLOCKING) { tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); } } else { e = hipErrorInvalidValue; @@ -1253,16 +1259,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1270,21 +1272,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes) } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1305,17 +1304,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - size_t sizeBytes = pitch * height; if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { value = value & 0xff; uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -1323,20 +1318,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } // TODO - is hipMemset supposed to be async? - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemset", &crit->_av); + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { @@ -1357,36 +1350,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte stream = ihipSyncAndResolveStream(stream); if (stream) { - auto crit = stream->lockopen_preKernelCommand(); - - hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { // use a faster dword-per-workitem copy: try { uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); + ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(uint32_t)); } catch (std::exception &ex) { + std::cout << ex.what() << std::endl; e = hipErrorInvalidValue; } } else { // use a slow byte-per-workitem copy: try { - ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); + ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; } } - cf.wait(); - - stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av); - + stream->locked_wait(); if (HIP_LAUNCH_BLOCKING) { tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str()); - cf.wait(); + stream->locked_wait(); tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str()); } } else { diff --git a/src/program_state.cpp b/src/program_state.cpp index be871a6e84..a4f7fdbdbe 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -195,9 +195,9 @@ namespace static vector> blobs{ code_object_blob_for_process()}; - dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) { + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) { elfio tmp; - if (tmp.load(i->dlpi_name)) { + if (tmp.load(info->dlpi_name)) { const auto it = find_section_if(tmp, [](const section* x) { return x->get_name() == ".kernel"; }); @@ -269,6 +269,61 @@ namespace return r; } + vector> function_names_for( + const elfio& reader, section* symtab) + { + vector> r; + symbol_section_accessor symbols{reader, symtab}; + + auto foo = reader.get_entry(); + + for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { + // TODO: this is boyscout code, caching the temporaries + // may be of worth. + string name; + Elf64_Addr value = 0; + Elf_Xword size = 0; + Elf_Half sect_idx = 0; + uint8_t bind = 0; + uint8_t type = 0; + uint8_t other = 0; + + symbols.get_symbol( + i, name, value, size, bind, type, sect_idx, other); + + if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { + r.emplace_back(value, name); + } + } + + return r; + } + + const vector>& function_names_for_process() + { + static constexpr const char self[] = "/proc/self/exe"; + + static vector> r; + static once_flag f; + + call_once(f, []() { + elfio reader; + + if (!reader.load(self)) { + throw runtime_error{ + "Failed to load the ELF file for the current process."}; + } + + auto symtab = find_section_if(reader, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); + + r = function_names_for(reader, symtab); + }); + + return r; + } + inline hsa_agent_t agent(hsa_executable_symbol_t x) { @@ -395,43 +450,32 @@ namespace hip_impl { const unordered_map& function_names() { - static constexpr const char self[] = "/proc/self/exe"; - - static unordered_map r; + static unordered_map r{ + function_names_for_process().cbegin(), + function_names_for_process().cend()}; static once_flag f; call_once(f, []() { - elfio reader; + dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { + elfio tmp; + if (tmp.load(info->dlpi_name)) { + const auto it = find_section_if(tmp, [](const section* x) { + return x->get_type() == SHT_SYMTAB; + }); - if (!reader.load(self)) { - throw runtime_error{ - "Failed to load the ELF file for the current process."}; - } + if (it) { + auto n = function_names_for(tmp, it); - auto symtab = find_section_if(reader, [](const section* x) { - return x->get_type() == SHT_SYMTAB; - }); + for (auto&& f : n) f.first += info->dlpi_addr; - symbol_section_accessor symbols{reader, symtab}; - - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - string name; - Elf64_Addr value = 0; - Elf_Xword size = 0; - Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; - - symbols.get_symbol( - i, name, value, size, bind, type, sect_idx, other); - - if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) { - r.emplace(value, name); + r.insert( + make_move_iterator(n.begin()), + make_move_iterator(n.end())); + } } - } + + return 0; + }, nullptr); }); return r;