2
0

Correctly deal with functions from shared objects, wherein the program visible VA == so_base_va + st_value(function_symbol). Remove quaint usage of pfe for hipMemset (which is actually fill_n).

[ROCm/hip commit: 2cacda91bb]
Este cometimento está contido em:
Alex Voicu
2017-11-01 22:33:13 +00:00
ascendente 70a41e7dac
cometimento dab971370e
2 ficheiros modificados com 136 adições e 105 eliminações
+60 -73
Ver ficheiro
@@ -1153,42 +1153,56 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
return ihipLogStatus(e);
}
// TODO - make member function of stream?
namespace
{
template<
uint32_t block_dim,
typename RandomAccessIterator,
typename N,
typename T>
__global__
void hip_fill_n(RandomAccessIterator f, N n, T value)
{
const uint32_t grid_dim = hipGridDim_x;
size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
while (idx < n) {
new (&f[idx]) T{value};
idx += grid_dim;
}
}
template<
typename T,
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline
const T& clamp_integer(const T& x, const T& lower, const T& upper)
{
assert(!(upper < lower));
return std::min(upper, std::max(x, lower));
}
}
template <typename T>
void
ihipMemsetKernel(hipStream_t stream,
LockedAccessor_StreamCrit_t &crit,
T * ptr, T val, size_t sizeBytes,
hc::completion_future *cf)
T * ptr, T val, size_t sizeBytes)
{
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
const int threads_per_wg = 256;
static constexpr uint32_t block_dim = 256;
int threads = wg * threads_per_wg;
if (threads > sizeBytes) {
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
}
hc::extent<1> ext(threads);
auto ext_tile = ext.tile(threads_per_wg);
*cf =
hc::parallel_for_each(
crit->_av,
ext_tile,
[=] (hc::tiled_index<1> idx)
__attribute__((hc))
{
int offset = amp_get_global_id(0);
// TODO-HCC - change to hc_get_local_size()
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
for (int i=offset; i<sizeBytes; i+=stride) {
ptr[i] = val;
}
});
const uint32_t grid_dim = clamp_integer<size_t>(
sizeBytes / block_dim, 1, UINT32_MAX);
hipLaunchKernelGGL(
hip_fill_n<block_dim>,
dim3(grid_dim),
dim3{block_dim},
0u,
stream,
ptr,
sizeBytes,
std::move(val));
}
@@ -1202,17 +1216,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1220,19 +1229,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
if (HIP_API_BLOCKING) {
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
cf.wait();
stream->locked_wait();
}
} else {
e = hipErrorInvalidValue;
@@ -1253,16 +1259,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1270,21 +1272,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1305,17 +1304,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
size_t sizeBytes = pitch * height;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1323,20 +1318,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1357,36 +1350,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
std::cout << ex.what() << std::endl;
e = hipErrorInvalidValue;
}
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
cf.wait();
stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
+76 -32
Ver ficheiro
@@ -195,9 +195,9 @@ namespace
static vector<vector<uint8_t>> blobs{
code_object_blob_for_process()};
dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) {
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
elfio tmp;
if (tmp.load(i->dlpi_name)) {
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_name() == ".kernel";
});
@@ -269,6 +269,61 @@ namespace
return r;
}
vector<pair<uintptr_t, string>> function_names_for(
const elfio& reader, section* symtab)
{
vector<pair<uintptr_t, string>> r;
symbol_section_accessor symbols{reader, symtab};
auto foo = reader.get_entry();
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
symbols.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) {
r.emplace_back(value, name);
}
}
return r;
}
const vector<pair<uintptr_t, string>>& function_names_for_process()
{
static constexpr const char self[] = "/proc/self/exe";
static vector<pair<uintptr_t, string>> r;
static once_flag f;
call_once(f, []() {
elfio reader;
if (!reader.load(self)) {
throw runtime_error{
"Failed to load the ELF file for the current process."};
}
auto symtab = find_section_if(reader, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
r = function_names_for(reader, symtab);
});
return r;
}
inline
hsa_agent_t agent(hsa_executable_symbol_t x)
{
@@ -395,43 +450,32 @@ namespace hip_impl
{
const unordered_map<uintptr_t, string>& function_names()
{
static constexpr const char self[] = "/proc/self/exe";
static unordered_map<uintptr_t, string> r;
static unordered_map<uintptr_t, string> r{
function_names_for_process().cbegin(),
function_names_for_process().cend()};
static once_flag f;
call_once(f, []() {
elfio reader;
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (!reader.load(self)) {
throw runtime_error{
"Failed to load the ELF file for the current process."};
}
if (it) {
auto n = function_names_for(tmp, it);
auto symtab = find_section_if(reader, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
for (auto&& f : n) f.first += info->dlpi_addr;
symbol_section_accessor symbols{reader, symtab};
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
symbols.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) {
r.emplace(value, name);
r.insert(
make_move_iterator(n.begin()),
make_move_iterator(n.end()));
}
}
}
return 0;
}, nullptr);
});
return r;