Correctly deal with functions from shared objects, wherein the program visible VA == so_base_va + st_value(function_symbol). Remove quaint usage of pfe for hipMemset (which is actually fill_n).
[ROCm/hip commit: 2cacda91bb]
Este cometimento está contido em:
@@ -1153,42 +1153,56 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
// TODO - make member function of stream?
|
||||
namespace
|
||||
{
|
||||
template<
|
||||
uint32_t block_dim,
|
||||
typename RandomAccessIterator,
|
||||
typename N,
|
||||
typename T>
|
||||
__global__
|
||||
void hip_fill_n(RandomAccessIterator f, N n, T value)
|
||||
{
|
||||
const uint32_t grid_dim = hipGridDim_x;
|
||||
|
||||
size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
|
||||
while (idx < n) {
|
||||
new (&f[idx]) T{value};
|
||||
idx += grid_dim;
|
||||
}
|
||||
}
|
||||
|
||||
template<
|
||||
typename T,
|
||||
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
|
||||
inline
|
||||
const T& clamp_integer(const T& x, const T& lower, const T& upper)
|
||||
{
|
||||
assert(!(upper < lower));
|
||||
|
||||
return std::min(upper, std::max(x, lower));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
ihipMemsetKernel(hipStream_t stream,
|
||||
LockedAccessor_StreamCrit_t &crit,
|
||||
T * ptr, T val, size_t sizeBytes,
|
||||
hc::completion_future *cf)
|
||||
T * ptr, T val, size_t sizeBytes)
|
||||
{
|
||||
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
|
||||
const int threads_per_wg = 256;
|
||||
static constexpr uint32_t block_dim = 256;
|
||||
|
||||
int threads = wg * threads_per_wg;
|
||||
if (threads > sizeBytes) {
|
||||
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
|
||||
}
|
||||
|
||||
|
||||
hc::extent<1> ext(threads);
|
||||
auto ext_tile = ext.tile(threads_per_wg);
|
||||
|
||||
*cf =
|
||||
hc::parallel_for_each(
|
||||
crit->_av,
|
||||
ext_tile,
|
||||
[=] (hc::tiled_index<1> idx)
|
||||
__attribute__((hc))
|
||||
{
|
||||
int offset = amp_get_global_id(0);
|
||||
// TODO-HCC - change to hc_get_local_size()
|
||||
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
|
||||
|
||||
for (int i=offset; i<sizeBytes; i+=stride) {
|
||||
ptr[i] = val;
|
||||
}
|
||||
});
|
||||
const uint32_t grid_dim = clamp_integer<size_t>(
|
||||
sizeBytes / block_dim, 1, UINT32_MAX);
|
||||
|
||||
hipLaunchKernelGGL(
|
||||
hip_fill_n<block_dim>,
|
||||
dim3(grid_dim),
|
||||
dim3{block_dim},
|
||||
0u,
|
||||
stream,
|
||||
ptr,
|
||||
sizeBytes,
|
||||
std::move(val));
|
||||
}
|
||||
|
||||
|
||||
@@ -1202,17 +1216,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
value = value & 0xff;
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1220,19 +1229,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
|
||||
|
||||
|
||||
if (HIP_API_BLOCKING) {
|
||||
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1253,16 +1259,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
value = value & 0xff;
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1270,21 +1272,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
// TODO - is hipMemset supposed to be async?
|
||||
cf.wait();
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
|
||||
|
||||
stream->locked_wait();
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
|
||||
}
|
||||
} else {
|
||||
@@ -1305,17 +1304,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
size_t sizeBytes = pitch * height;
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
value = value & 0xff;
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1323,20 +1318,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
// TODO - is hipMemset supposed to be async?
|
||||
cf.wait();
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
|
||||
stream->locked_wait();
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
|
||||
}
|
||||
} else {
|
||||
@@ -1357,36 +1350,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
// use a faster dword-per-workitem copy:
|
||||
try {
|
||||
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
|
||||
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
std::cout << ex.what() << std::endl;
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
|
||||
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
cf.wait();
|
||||
|
||||
stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av);
|
||||
|
||||
stream->locked_wait();
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
|
||||
cf.wait();
|
||||
stream->locked_wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -195,9 +195,9 @@ namespace
|
||||
static vector<vector<uint8_t>> blobs{
|
||||
code_object_blob_for_process()};
|
||||
|
||||
dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
|
||||
elfio tmp;
|
||||
if (tmp.load(i->dlpi_name)) {
|
||||
if (tmp.load(info->dlpi_name)) {
|
||||
const auto it = find_section_if(tmp, [](const section* x) {
|
||||
return x->get_name() == ".kernel";
|
||||
});
|
||||
@@ -269,6 +269,61 @@ namespace
|
||||
return r;
|
||||
}
|
||||
|
||||
vector<pair<uintptr_t, string>> function_names_for(
|
||||
const elfio& reader, section* symtab)
|
||||
{
|
||||
vector<pair<uintptr_t, string>> r;
|
||||
symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
auto foo = reader.get_entry();
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
symbols.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) {
|
||||
r.emplace_back(value, name);
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const vector<pair<uintptr_t, string>>& function_names_for_process()
|
||||
{
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
|
||||
static vector<pair<uintptr_t, string>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
elfio reader;
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{
|
||||
"Failed to load the ELF file for the current process."};
|
||||
}
|
||||
|
||||
auto symtab = find_section_if(reader, [](const section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
r = function_names_for(reader, symtab);
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_agent_t agent(hsa_executable_symbol_t x)
|
||||
{
|
||||
@@ -395,43 +450,32 @@ namespace hip_impl
|
||||
{
|
||||
const unordered_map<uintptr_t, string>& function_names()
|
||||
{
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
|
||||
static unordered_map<uintptr_t, string> r;
|
||||
static unordered_map<uintptr_t, string> r{
|
||||
function_names_for_process().cbegin(),
|
||||
function_names_for_process().cend()};
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
elfio reader;
|
||||
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
|
||||
elfio tmp;
|
||||
if (tmp.load(info->dlpi_name)) {
|
||||
const auto it = find_section_if(tmp, [](const section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{
|
||||
"Failed to load the ELF file for the current process."};
|
||||
}
|
||||
if (it) {
|
||||
auto n = function_names_for(tmp, it);
|
||||
|
||||
auto symtab = find_section_if(reader, [](const section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
for (auto&& f : n) f.first += info->dlpi_addr;
|
||||
|
||||
symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
symbols.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) {
|
||||
r.emplace(value, name);
|
||||
r.insert(
|
||||
make_move_iterator(n.begin()),
|
||||
make_move_iterator(n.end()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}, nullptr);
|
||||
});
|
||||
|
||||
return r;
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador