Refactor the __device__ versions of memset and memcpy to be less awkward i.e. not return nullptr as opposed to the destination pointer (it can only be assumed it was done for maximum confusion) and actually unroll as they claim to. Change all of the {to, from}Symbol functions to use hipModuleGetGlobal, as opposed to hc::accelerator::get_symbol_address which is no longer valid with module based dispatch.
此提交包含在:
@@ -35,6 +35,24 @@ THE SOFTWARE.
|
||||
struct ihipModuleSymbol_t;
|
||||
using hipFunction_t = ihipModuleSymbol_t*;
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<>
|
||||
struct hash<hsa_agent_t> {
|
||||
size_t operator()(hsa_agent_t x) const
|
||||
{
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
inline
|
||||
constexpr
|
||||
bool operator==(hsa_agent_t x, hsa_agent_t y)
|
||||
{
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
struct Kernel_descriptor {
|
||||
@@ -50,6 +68,8 @@ namespace hip_impl
|
||||
}
|
||||
};
|
||||
|
||||
const std::unordered_map<
|
||||
hsa_agent_t, std::vector<hsa_executable_t>>& executables();
|
||||
const std::unordered_map<
|
||||
std::uintptr_t,
|
||||
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
|
||||
|
||||
+37
-12
@@ -102,23 +102,48 @@ __device__ void* __hip_hc_free(void *ptr)
|
||||
// loop unrolling
|
||||
__device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size)
|
||||
{
|
||||
uint8_t *dstPtr, *srcPtr;
|
||||
dstPtr = (uint8_t*)dst;
|
||||
srcPtr = (uint8_t*)src;
|
||||
for(uint32_t i=0;i<size;i++) {
|
||||
dstPtr[i] = srcPtr[i];
|
||||
auto dstPtr = static_cast<uint8_t*>(dst);
|
||||
auto srcPtr = static_cast<const uint8_t*>(src);
|
||||
|
||||
while (size >= 4u) {
|
||||
dstPtr[0] = srcPtr[0];
|
||||
dstPtr[1] = srcPtr[1];
|
||||
dstPtr[2] = srcPtr[2];
|
||||
dstPtr[3] = srcPtr[3];
|
||||
|
||||
size -= 4u;
|
||||
srcPtr += 4u;
|
||||
dstPtr += 4u;
|
||||
}
|
||||
return nullptr;
|
||||
switch (size) {
|
||||
case 3: dstPtr[2] = srcPtr[2];
|
||||
case 2: dstPtr[1] = srcPtr[1];
|
||||
case 1: dstPtr[0] = srcPtr[0];
|
||||
}
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size)
|
||||
__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size)
|
||||
{
|
||||
uint8_t *dstPtr;
|
||||
dstPtr = (uint8_t*)ptr;
|
||||
for(uint32_t i=0;i<size;i++) {
|
||||
dstPtr[i] = val;
|
||||
auto dstPtr = static_cast<uint8_t*>(dst);
|
||||
|
||||
while (size >= 4u) {
|
||||
dstPtr[0] = val;
|
||||
dstPtr[1] = val;
|
||||
dstPtr[2] = val;
|
||||
dstPtr[3] = val;
|
||||
|
||||
size -= 4u;
|
||||
dstPtr += 4u;
|
||||
}
|
||||
return nullptr;
|
||||
switch (size) {
|
||||
case 3: dstPtr[2] = val;
|
||||
case 2: dstPtr[1] = val;
|
||||
case 1: dstPtr[0] = val;
|
||||
}
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
__device__ float __hip_erfinvf(float x){
|
||||
|
||||
+18
-6
@@ -715,7 +715,10 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *dst = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t dst = nullptr;
|
||||
size_t byte_cnt = 0u;
|
||||
auto status = hipModuleGetGlobal(
|
||||
&dst, &byte_cnt, 0, static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
|
||||
if(dst == nullptr)
|
||||
@@ -750,7 +753,10 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *src = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t src = nullptr;
|
||||
size_t byte_cnt = 0u;
|
||||
auto status = hipModuleGetGlobal(
|
||||
&src, &byte_cnt, 0, static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
|
||||
if(dst == nullptr)
|
||||
@@ -787,7 +793,10 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *dst = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t dst = nullptr;
|
||||
size_t byte_cnt = 0u;
|
||||
auto status = hipModuleGetGlobal(
|
||||
&dst, &byte_cnt, 0, static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
|
||||
|
||||
if(dst == nullptr)
|
||||
@@ -825,7 +834,10 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
|
||||
|
||||
hc::accelerator acc = ctx->getDevice()->_acc;
|
||||
|
||||
void *src = acc.get_symbol_address((const char*) symbolName);
|
||||
hipDeviceptr_t src = nullptr;
|
||||
size_t byte_cnt = 0u;
|
||||
auto status = hipModuleGetGlobal(
|
||||
&src, &byte_cnt, 0, static_cast<const char*>(symbolName));
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src);
|
||||
|
||||
if(src == nullptr || dst == nullptr)
|
||||
@@ -1171,9 +1183,9 @@ namespace
|
||||
__global__
|
||||
void hip_fill_n(RandomAccessIterator f, N n, T value)
|
||||
{
|
||||
const uint32_t grid_dim = hipGridDim_x;
|
||||
const uint32_t grid_dim = gridDim.x * blockDim.x;
|
||||
|
||||
size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
|
||||
size_t idx = blockIdx.x * block_dim + threadIdx.x;
|
||||
while (idx < n) {
|
||||
new (&f[idx]) T{value};
|
||||
idx += grid_dim;
|
||||
|
||||
+85
-34
@@ -554,16 +554,93 @@ namespace
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<Agent_global> read_agent_globals(hipModule_t hmodule)
|
||||
std::vector<Agent_global> read_agent_globals(
|
||||
hsa_agent_t agent, hsa_executable_t executable)
|
||||
{
|
||||
std::vector<Agent_global> r;
|
||||
|
||||
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
hmodule->executable, this_agent(), copy_agent_global_variables, &r);
|
||||
executable, agent, copy_agent_global_variables, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename ForwardIterator>
|
||||
std::pair<hipDeviceptr_t, std::size_t> read_global_description(
|
||||
ForwardIterator f, ForwardIterator l, const char* name)
|
||||
{
|
||||
const auto it = std::find_if(
|
||||
f, l, [=](const Agent_global& x) { return x.name == name; });
|
||||
|
||||
return it == l ?
|
||||
std::make_pair(nullptr, 0u) :
|
||||
std::make_pair(it->address, it->byte_cnt);
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_module(
|
||||
hipDeviceptr_t *dptr,
|
||||
size_t* bytes,
|
||||
hipModule_t hmod,
|
||||
const char* name)
|
||||
{
|
||||
static std::unordered_map<
|
||||
hipModule_t, std::vector<Agent_global>> agent_globals;
|
||||
|
||||
// TODO: this is not particularly robust.
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
agent_globals.emplace(
|
||||
hmod, read_agent_globals(this_agent(), hmod->executable));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: This is unsafe iff some other emplacement triggers rehashing.
|
||||
// It will have to be properly fleshed out in the future.
|
||||
const auto it0 = agent_globals.find(hmod);
|
||||
if (it0 == agent_globals.cend()) {
|
||||
throw std::runtime_error{"agent_globals data structure corrupted."};
|
||||
}
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(
|
||||
it0->second.cbegin(), it0->second.cend(), name);
|
||||
|
||||
return dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
|
||||
hipError_t read_agent_global_from_process(
|
||||
hipDeviceptr_t *dptr, size_t* bytes, const char* name)
|
||||
{
|
||||
static std::unordered_map<
|
||||
hsa_agent_t, std::vector<Agent_global>> agent_globals;
|
||||
static std::once_flag f;
|
||||
|
||||
std::call_once(f, []() {
|
||||
for (auto&& agent_executables : hip_impl::executables()) {
|
||||
std::vector<Agent_global> tmp0;
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
auto tmp1 = read_agent_globals(
|
||||
agent_executables.first, executable);
|
||||
tmp0.insert(
|
||||
tmp0.end(),
|
||||
std::make_move_iterator(tmp1.begin()),
|
||||
std::make_move_iterator(tmp1.end()));
|
||||
}
|
||||
agent_globals.emplace(agent_executables.first, std::move(tmp0));
|
||||
}
|
||||
});
|
||||
|
||||
const auto it = agent_globals.find(this_agent());
|
||||
|
||||
if (it == agent_globals.cend()) return hipErrorNotInitialized;
|
||||
|
||||
std::tie(*dptr, *bytes) = read_global_description(
|
||||
it->second.cbegin(), it->second.cend(), name);
|
||||
|
||||
return dptr ? hipSuccess : hipErrorNotFound;
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
|
||||
@@ -574,41 +651,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
|
||||
if(dptr == NULL || bytes == NULL){
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
if(name == NULL || hmod == NULL){
|
||||
if(name == NULL){
|
||||
return ihipLogStatus(hipErrorNotInitialized);
|
||||
}
|
||||
else{
|
||||
static std::unordered_map<
|
||||
hipModule_t, std::vector<Agent_global>> agent_globals;
|
||||
ret = hmod ?
|
||||
read_agent_global_from_module(dptr, bytes, hmod, name) :
|
||||
read_agent_global_from_process(dptr, bytes, name);
|
||||
|
||||
// TODO: this is not particularly robust.
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
static std::mutex mtx;
|
||||
std::lock_guard<std::mutex> lck{mtx};
|
||||
|
||||
if (agent_globals.count(hmod) == 0) {
|
||||
agent_globals.emplace(hmod, read_agent_globals(hmod));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: This is unsafe iff some other emplacement triggers rehashing.
|
||||
// It will have to be properly fleshed out in the future.
|
||||
const auto it0 = agent_globals.find(hmod);
|
||||
if (it0 == agent_globals.cend()) {
|
||||
throw std::runtime_error{"agent_globals data structure corrupted."};
|
||||
}
|
||||
|
||||
const auto it1 = std::find_if(
|
||||
it0->second.cbegin(),
|
||||
it0->second.cend(),
|
||||
[=](const Agent_global& x) { return x.name == name; });
|
||||
|
||||
if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound);
|
||||
|
||||
*dptr = it1->address;
|
||||
*bytes = it1->byte_cnt;
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
return ihipLogStatus(ret);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+46
-61
@@ -31,14 +31,6 @@ using namespace std;
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<>
|
||||
struct hash<hsa_agent_t> {
|
||||
size_t operator()(hsa_agent_t x) const
|
||||
{
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<hsa_isa_t> {
|
||||
size_t operator()(hsa_isa_t x) const
|
||||
@@ -48,13 +40,6 @@ namespace std
|
||||
};
|
||||
}
|
||||
|
||||
inline
|
||||
constexpr
|
||||
bool operator==(hsa_agent_t x, hsa_agent_t y)
|
||||
{
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
|
||||
inline
|
||||
constexpr
|
||||
bool operator==(hsa_isa_t x, hsa_isa_t y)
|
||||
@@ -242,52 +227,6 @@ namespace
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>& executables()
|
||||
{
|
||||
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static const auto accelerators = hc::accelerator::get_all();
|
||||
|
||||
for (auto&& acc : accelerators) {
|
||||
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
|
||||
|
||||
if (!agent) continue;
|
||||
|
||||
hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) {
|
||||
const auto it = code_object_blobs().find(x);
|
||||
|
||||
if (it != code_object_blobs().cend()) {
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
|
||||
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only
|
||||
// meant for illustration.
|
||||
string blob_to_str{blob.cbegin(), blob.cend()};
|
||||
stringstream istr{blob_to_str};
|
||||
tmp = load_executable(tmp, a, istr);
|
||||
|
||||
if (tmp.handle) r[a].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}, agent);
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
vector<pair<uintptr_t, string>> function_names_for(
|
||||
const elfio& reader, section* symtab)
|
||||
{
|
||||
@@ -467,6 +406,52 @@ namespace
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>& executables()
|
||||
{
|
||||
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static const auto accelerators = hc::accelerator::get_all();
|
||||
|
||||
for (auto&& acc : accelerators) {
|
||||
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
|
||||
|
||||
if (!agent) continue;
|
||||
|
||||
hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) {
|
||||
const auto it = code_object_blobs().find(x);
|
||||
|
||||
if (it != code_object_blobs().cend()) {
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
|
||||
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only
|
||||
// meant for illustration.
|
||||
string blob_to_str{blob.cbegin(), blob.cend()};
|
||||
stringstream istr{blob_to_str};
|
||||
tmp = load_executable(tmp, a, istr);
|
||||
|
||||
if (tmp.handle) r[a].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}, agent);
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<uintptr_t, string>& function_names()
|
||||
{
|
||||
static unordered_map<uintptr_t, string> r{
|
||||
|
||||
新增問題並參考
封鎖使用者