e9a6f2c3e6
Since access may only be manipulated on whole pages, suballocator fragments must cooperate to set the page's access. Since the KFD does not migrate memory on access changes this implementation makes agent access sticky across the requests in a fragmented page. Change-Id: I88479ed45fb40e9782b704526a7b8ffb22e7bd76
1367 行
46 KiB
C++
1367 行
46 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// The University of Illinois/NCSA
|
|
// Open Source License (NCSA)
|
|
//
|
|
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Developed by:
|
|
//
|
|
// AMD Research and AMD HSA Software Development
|
|
//
|
|
// Advanced Micro Devices, Inc.
|
|
//
|
|
// www.amd.com
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to
|
|
// deal with the Software without restriction, including without limitation
|
|
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
// and/or sell copies of the Software, and to permit persons to whom the
|
|
// Software is furnished to do so, subject to the following conditions:
|
|
//
|
|
// - Redistributions of source code must retain the above copyright notice,
|
|
// this list of conditions and the following disclaimers.
|
|
// - Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimers in
|
|
// the documentation and/or other materials provided with the distribution.
|
|
// - Neither the names of Advanced Micro Devices, Inc,
|
|
// nor the names of its contributors may be used to endorse or promote
|
|
// products derived from this Software without specific prior written
|
|
// permission.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
|
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
// DEALINGS WITH THE SOFTWARE.
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "core/inc/runtime.h"
|
|
|
|
#include <algorithm>
|
|
#include <atomic>
|
|
#include <cstring>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "core/common/shared.h"
|
|
|
|
#include "core/inc/hsa_ext_interface.h"
|
|
#include "core/inc/amd_cpu_agent.h"
|
|
#include "core/inc/amd_gpu_agent.h"
|
|
#include "core/inc/amd_memory_region.h"
|
|
#include "core/inc/amd_topology.h"
|
|
#include "core/inc/signal.h"
|
|
#include "core/inc/interrupt_signal.h"
|
|
#include "core/inc/hsa_ext_amd_impl.h"
|
|
|
|
#include "core/inc/hsa_api_trace_int.h"
|
|
|
|
#define HSA_VERSION_MAJOR 1
|
|
#define HSA_VERSION_MINOR 1
|
|
|
|
const char rocrbuildid[] __attribute__((used)) = "ROCR BUILD ID: " STRING(ROCR_BUILD_ID);
|
|
|
|
namespace core {
|
|
bool g_use_interrupt_wait = true;
|
|
|
|
Runtime* Runtime::runtime_singleton_ = NULL;
|
|
|
|
KernelMutex Runtime::bootstrap_lock_;
|
|
|
|
static bool loaded = true;
|
|
|
|
class RuntimeCleanup {
|
|
public:
|
|
~RuntimeCleanup() {
|
|
if (!Runtime::IsOpen()) {
|
|
delete Runtime::runtime_singleton_;
|
|
}
|
|
|
|
loaded = false;
|
|
}
|
|
};
|
|
|
|
static RuntimeCleanup cleanup_at_unload_;
|
|
|
|
hsa_status_t Runtime::Acquire() {
|
|
// Check to see if HSA has been cleaned up (process exit)
|
|
if (!loaded) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
|
|
// Handle initialization races
|
|
ScopedAcquire<KernelMutex> boot(&bootstrap_lock_);
|
|
|
|
if (runtime_singleton_ == NULL) {
|
|
runtime_singleton_ = new Runtime();
|
|
}
|
|
|
|
// Serialize with release
|
|
ScopedAcquire<KernelMutex> lock(&runtime_singleton_->kernel_lock_);
|
|
|
|
if (runtime_singleton_->ref_count_ == INT32_MAX) {
|
|
return HSA_STATUS_ERROR_REFCOUNT_OVERFLOW;
|
|
}
|
|
|
|
runtime_singleton_->ref_count_++;
|
|
|
|
if (runtime_singleton_->ref_count_ == 1) {
|
|
hsa_status_t status = runtime_singleton_->Load();
|
|
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
runtime_singleton_->ref_count_--;
|
|
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::Release() {
|
|
ScopedAcquire<KernelMutex> lock(&kernel_lock_);
|
|
if (ref_count_ == 0) {
|
|
return HSA_STATUS_ERROR_NOT_INITIALIZED;
|
|
}
|
|
|
|
if (ref_count_ == 1) {
|
|
// Release all registered memory, then unload backends
|
|
Unload();
|
|
}
|
|
|
|
ref_count_--;
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
bool Runtime::IsOpen() {
|
|
return (Runtime::runtime_singleton_ != NULL) &&
|
|
(Runtime::runtime_singleton_->ref_count_ != 0);
|
|
}
|
|
|
|
void Runtime::RegisterAgent(Agent* agent) {
|
|
// Record the agent in the node-to-agent reverse lookup table.
|
|
agents_by_node_[agent->node_id()].push_back(agent);
|
|
|
|
// Process agent as a cpu or gpu device.
|
|
if (agent->device_type() == Agent::DeviceType::kAmdCpuDevice) {
|
|
cpu_agents_.push_back(agent);
|
|
|
|
// Add cpu regions to the system region list.
|
|
for (const core::MemoryRegion* region : agent->regions()) {
|
|
if (region->fine_grain()) {
|
|
system_regions_fine_.push_back(region);
|
|
} else {
|
|
system_regions_coarse_.push_back(region);
|
|
}
|
|
}
|
|
|
|
assert(system_regions_fine_.size() > 0);
|
|
|
|
// Init default fine grain system region allocator using fine grain
|
|
// system region of the first discovered CPU agent.
|
|
if (cpu_agents_.size() == 1) {
|
|
// Might need memory pooling to cover allocation that
|
|
// requires less than 4096 bytes.
|
|
system_allocator_ =
|
|
[&](size_t size, size_t alignment,
|
|
MemoryRegion::AllocateFlags alloc_flags) -> void* {
|
|
assert(alignment <= 4096);
|
|
void* ptr = NULL;
|
|
return (HSA_STATUS_SUCCESS ==
|
|
core::Runtime::runtime_singleton_->AllocateMemory(
|
|
system_regions_fine_[0], size, alloc_flags, &ptr))
|
|
? ptr
|
|
: NULL;
|
|
};
|
|
|
|
system_deallocator_ =
|
|
[](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
|
|
|
|
BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
|
|
}
|
|
|
|
// Setup system clock frequency for the first time.
|
|
if (sys_clock_freq_ == 0) {
|
|
// Cache system clock frequency
|
|
HsaClockCounters clocks;
|
|
hsaKmtGetClockCounters(0, &clocks);
|
|
sys_clock_freq_ = clocks.SystemClockFrequencyHz;
|
|
}
|
|
} else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) {
|
|
gpu_agents_.push_back(agent);
|
|
|
|
gpu_ids_.push_back(agent->node_id());
|
|
|
|
// Assign the first discovered gpu agent as blit agent that will provide
|
|
// DMA operation for hsa_memory_copy.
|
|
if (blit_agent_ == NULL) {
|
|
blit_agent_ = agent;
|
|
|
|
// Query the start and end address of the SVM address space in this
|
|
// platform.
|
|
if (reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
|
|
HSA_PROFILE_BASE) {
|
|
std::vector<const core::MemoryRegion*>::const_iterator it =
|
|
std::find_if(blit_agent_->regions().begin(),
|
|
blit_agent_->regions().end(),
|
|
[](const core::MemoryRegion* region) {
|
|
return (
|
|
reinterpret_cast<const amd::MemoryRegion*>(region)->IsSvm());
|
|
});
|
|
|
|
assert(it != blit_agent_->regions().end());
|
|
|
|
const amd::MemoryRegion* svm_region =
|
|
reinterpret_cast<const amd::MemoryRegion*>(*it);
|
|
|
|
start_svm_address_ =
|
|
static_cast<uintptr_t>(svm_region->GetBaseAddress());
|
|
end_svm_address_ = start_svm_address_ + svm_region->GetPhysicalSize();
|
|
|
|
// Bind VM fault handler when we detect the first GPU agent.
|
|
// TODO: validate if it works on APU.
|
|
BindVmFaultHandler();
|
|
} else {
|
|
start_svm_address_ = 0;
|
|
end_svm_address_ = os::GetUserModeVirtualMemoryBase() +
|
|
os::GetUserModeVirtualMemorySize();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Runtime::DestroyAgents() {
|
|
agents_by_node_.clear();
|
|
|
|
std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject());
|
|
gpu_agents_.clear();
|
|
|
|
gpu_ids_.clear();
|
|
|
|
std::for_each(cpu_agents_.begin(), cpu_agents_.end(), DeleteObject());
|
|
cpu_agents_.clear();
|
|
|
|
blit_agent_ = NULL;
|
|
|
|
system_regions_fine_.clear();
|
|
system_regions_coarse_.clear();
|
|
}
|
|
|
|
void Runtime::SetLinkCount(size_t num_link) {
|
|
const size_t last_index = GetIndexLinkInfo(0, num_link);
|
|
link_matrix_.resize(last_index);
|
|
|
|
memset(&link_matrix_[0], 0,
|
|
link_matrix_.size() * sizeof(hsa_amd_memory_pool_link_info_t));
|
|
}
|
|
|
|
void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
|
|
uint32_t num_hop,
|
|
hsa_amd_memory_pool_link_info_t& link_info) {
|
|
const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
|
|
link_matrix_[idx].num_hop = num_hop;
|
|
link_matrix_[idx].info = link_info;
|
|
|
|
// Limit the number of hop to 1 since the runtime does not have enough
|
|
// information to share to the user about each hop.
|
|
link_matrix_[idx].num_hop = std::min(link_matrix_[idx].num_hop , 1U);
|
|
}
|
|
|
|
const Runtime::LinkInfo Runtime::GetLinkInfo(uint32_t node_id_from,
|
|
uint32_t node_id_to) {
|
|
return (node_id_from != node_id_to)
|
|
? link_matrix_[GetIndexLinkInfo(node_id_from, node_id_to)]
|
|
: LinkInfo(); // No link.
|
|
}
|
|
|
|
uint32_t Runtime::GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to) {
|
|
const uint32_t node_id_max = std::max(node_id_from, node_id_to) - 1;
|
|
const uint32_t node_id_min = std::min(node_id_from, node_id_to);
|
|
return ((node_id_max * (node_id_max + 1) / 2) + node_id_min);
|
|
}
|
|
|
|
hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
|
|
void* data),
|
|
void* data) {
|
|
if (!IsOpen()) {
|
|
return HSA_STATUS_ERROR_NOT_INITIALIZED;
|
|
}
|
|
|
|
std::vector<core::Agent*>* agent_lists[2] = {&cpu_agents_, &gpu_agents_};
|
|
for (std::vector<core::Agent*>* agent_list : agent_lists) {
|
|
for (size_t i = 0; i < agent_list->size(); ++i) {
|
|
hsa_agent_t agent = Agent::Convert(agent_list->at(i));
|
|
hsa_status_t status = callback(agent, data);
|
|
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
return status;
|
|
}
|
|
}
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
|
|
MemoryRegion::AllocateFlags alloc_flags,
|
|
void** address) {
|
|
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
|
hsa_status_t status = region->Allocate(size, alloc_flags, address);
|
|
|
|
// Track the allocation result so that it could be freed properly.
|
|
if (status == HSA_STATUS_SUCCESS) {
|
|
allocation_map_[*address] = AllocationRegion(region, size);
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
hsa_status_t Runtime::FreeMemory(void* ptr) {
|
|
if (ptr == nullptr) {
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
const MemoryRegion* region = nullptr;
|
|
size_t size = 0;
|
|
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
|
|
|
std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);
|
|
|
|
if (it == allocation_map_.end()) {
|
|
assert(false && "Can't find address in allocation map");
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
region = it->second.region;
|
|
size = it->second.size;
|
|
|
|
// Imported fragments can't be released with FreeMemory.
|
|
if (region == nullptr) {
|
|
assert(false && "Can't release imported memory with free.");
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
|
|
allocation_map_.erase(it);
|
|
|
|
return region->Free(ptr, size);
|
|
}
|
|
|
|
hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
|
|
assert(dst != NULL && src != NULL && size != 0);
|
|
|
|
bool is_src_system = false;
|
|
bool is_dst_system = false;
|
|
const uintptr_t src_uptr = reinterpret_cast<uintptr_t>(src);
|
|
const uintptr_t dst_uptr = reinterpret_cast<uintptr_t>(dst);
|
|
|
|
if ((reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
|
|
HSA_PROFILE_FULL)) {
|
|
is_src_system = (src_uptr < end_svm_address_);
|
|
is_dst_system = (dst_uptr < end_svm_address_);
|
|
} else {
|
|
is_src_system =
|
|
((src_uptr < start_svm_address_) || (src_uptr >= end_svm_address_));
|
|
is_dst_system =
|
|
((dst_uptr < start_svm_address_) || (dst_uptr >= end_svm_address_));
|
|
|
|
if ((is_src_system && !is_dst_system) ||
|
|
(!is_src_system && is_dst_system)) {
|
|
// Use staging buffer or pin if either src or dst is gpuvm and the other
|
|
// is system memory allocated via OS or C/C++ allocator.
|
|
return CopyMemoryHostAlloc(dst, src, size, is_dst_system);
|
|
}
|
|
}
|
|
|
|
if (is_src_system && is_dst_system) {
|
|
memmove(dst, src, size);
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
return blit_agent_->DmaCopy(dst, src, size);
|
|
}
|
|
|
|
hsa_status_t Runtime::CopyMemoryHostAlloc(void* dst, const void* src,
|
|
size_t size, bool dst_malloc) {
|
|
void* usrptr = (dst_malloc) ? dst : const_cast<void*>(src);
|
|
void* agent_ptr = NULL;
|
|
|
|
hsa_agent_t blit_agent = core::Agent::Convert(blit_agent_);
|
|
|
|
const amd::MemoryRegion* system_region =
|
|
reinterpret_cast<const amd::MemoryRegion*>(system_regions_fine_[0]);
|
|
hsa_status_t stat =
|
|
system_region->Lock(1, &blit_agent, usrptr, size, &agent_ptr);
|
|
|
|
if (stat != HSA_STATUS_SUCCESS) {
|
|
return stat;
|
|
}
|
|
|
|
stat = blit_agent_->DmaCopy((dst_malloc) ? agent_ptr : dst,
|
|
(dst_malloc) ? src : agent_ptr, size);
|
|
|
|
system_region->Unlock(usrptr);
|
|
|
|
return stat;
|
|
}
|
|
|
|
hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
|
|
const void* src, core::Agent& src_agent,
|
|
size_t size,
|
|
std::vector<core::Signal*>& dep_signals,
|
|
core::Signal& completion_signal) {
|
|
const bool dst_gpu =
|
|
(dst_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
|
|
const bool src_gpu =
|
|
(src_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
|
|
if (dst_gpu || src_gpu) {
|
|
core::Agent& copy_agent = (src_gpu) ? src_agent : dst_agent;
|
|
return copy_agent.DmaCopy(dst, dst_agent, src, src_agent, size, dep_signals,
|
|
completion_signal);
|
|
}
|
|
|
|
// For cpu to cpu, fire and forget a copy thread.
|
|
const bool profiling_enabled =
|
|
(dst_agent.profiling_enabled() || src_agent.profiling_enabled());
|
|
std::thread(
|
|
[](void* dst, const void* src, size_t size,
|
|
std::vector<core::Signal*> dep_signals,
|
|
core::Signal* completion_signal, bool profiling_enabled) {
|
|
|
|
for (core::Signal* dep : dep_signals) {
|
|
dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
|
|
HSA_WAIT_STATE_BLOCKED);
|
|
}
|
|
|
|
if (profiling_enabled) {
|
|
HsaClockCounters clocks = {0};
|
|
core::Runtime::runtime_singleton_->GetSystemInfo(
|
|
HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
|
|
completion_signal->signal_.start_ts = clocks.SystemClockCounter;
|
|
}
|
|
|
|
memcpy(dst, src, size);
|
|
|
|
if (profiling_enabled) {
|
|
HsaClockCounters clocks = {0};
|
|
core::Runtime::runtime_singleton_->GetSystemInfo(
|
|
HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
|
|
completion_signal->signal_.end_ts = clocks.SystemClockCounter;
|
|
}
|
|
|
|
completion_signal->SubRelease(1);
|
|
},
|
|
dst, src, size, dep_signals, &completion_signal,
|
|
profiling_enabled).detach();
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::FillMemory(void* ptr, uint32_t value, size_t count) {
|
|
// Choose blit agent from pointer info
|
|
hsa_amd_pointer_info_t info;
|
|
uint32_t agent_count;
|
|
hsa_agent_t* accessible = nullptr;
|
|
info.size = sizeof(info);
|
|
MAKE_SCOPE_GUARD([&]() { free(accessible); });
|
|
hsa_status_t err = PtrInfo(ptr, &info, malloc, &agent_count, &accessible);
|
|
if (err != HSA_STATUS_SUCCESS) return err;
|
|
|
|
ptrdiff_t endPtr = (ptrdiff_t)ptr + count * sizeof(uint32_t);
|
|
|
|
// Check for GPU fill
|
|
// Selects GPU fill for SVM and Locked allocations if a GPU address is given and is mapped.
|
|
if (info.agentBaseAddress <= ptr &&
|
|
endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
|
|
core::Agent* blit_agent = core::Agent::Convert(info.agentOwner);
|
|
if (blit_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
|
|
blit_agent = nullptr;
|
|
for (int i = 0; i < agent_count; i++) {
|
|
if (core::Agent::Convert(accessible[i])->device_type() ==
|
|
core::Agent::DeviceType::kAmdGpuDevice) {
|
|
blit_agent = core::Agent::Convert(accessible[i]);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (blit_agent) return blit_agent->DmaFill(ptr, value, count);
|
|
}
|
|
|
|
// Host and unmapped SVM addresses copy via host.
|
|
if (info.hostBaseAddress <= ptr && endPtr <= (ptrdiff_t)info.hostBaseAddress + info.sizeInBytes) {
|
|
memset(ptr, value, count * sizeof(uint32_t));
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
return HSA_STATUS_ERROR_INVALID_ALLOCATION;
|
|
}
|
|
|
|
hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
|
|
const hsa_agent_t* agents, const void* ptr) {
|
|
const amd::MemoryRegion* amd_region = NULL;
|
|
size_t alloc_size = 0;
|
|
|
|
{
|
|
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
|
|
|
std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);
|
|
|
|
if (it == allocation_map_.end()) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
amd_region = reinterpret_cast<const amd::MemoryRegion*>(it->second.region);
|
|
alloc_size = it->second.size;
|
|
}
|
|
|
|
return amd_region->AllowAccess(num_agents, agents, ptr, alloc_size);
|
|
}
|
|
|
|
hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) {
|
|
switch (attribute) {
|
|
case HSA_SYSTEM_INFO_VERSION_MAJOR:
|
|
*((uint16_t*)value) = HSA_VERSION_MAJOR;
|
|
break;
|
|
case HSA_SYSTEM_INFO_VERSION_MINOR:
|
|
*((uint16_t*)value) = HSA_VERSION_MINOR;
|
|
break;
|
|
case HSA_SYSTEM_INFO_TIMESTAMP: {
|
|
HsaClockCounters clocks;
|
|
hsaKmtGetClockCounters(0, &clocks);
|
|
*((uint64_t*)value) = clocks.SystemClockCounter;
|
|
break;
|
|
}
|
|
case HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY: {
|
|
assert(sys_clock_freq_ != 0 &&
|
|
"Use of HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY before HSA "
|
|
"initialization completes.");
|
|
*(uint64_t*)value = sys_clock_freq_;
|
|
break;
|
|
}
|
|
case HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT:
|
|
*((uint64_t*)value) = 0xFFFFFFFFFFFFFFFF;
|
|
break;
|
|
case HSA_SYSTEM_INFO_ENDIANNESS:
|
|
#if defined(HSA_LITTLE_ENDIAN)
|
|
*((hsa_endianness_t*)value) = HSA_ENDIANNESS_LITTLE;
|
|
#else
|
|
*((hsa_endianness_t*)value) = HSA_ENDIANNESS_BIG;
|
|
#endif
|
|
break;
|
|
case HSA_SYSTEM_INFO_MACHINE_MODEL:
|
|
#if defined(HSA_LARGE_MODEL)
|
|
*((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE;
|
|
#else
|
|
*((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL;
|
|
#endif
|
|
break;
|
|
case HSA_SYSTEM_INFO_EXTENSIONS: {
|
|
memset(value, 0, sizeof(uint8_t) * 128);
|
|
|
|
auto setFlag = [&](uint32_t bit) {
|
|
assert(bit < 128 * 8 && "Extension value exceeds extension bitmask");
|
|
uint index = bit / 8;
|
|
uint subBit = bit % 8;
|
|
((uint8_t*)value)[index] |= 1 << subBit;
|
|
};
|
|
|
|
if (hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) {
|
|
setFlag(HSA_EXTENSION_FINALIZER);
|
|
}
|
|
|
|
if (hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) {
|
|
setFlag(HSA_EXTENSION_IMAGES);
|
|
}
|
|
|
|
if (hsa_internal_api_table_.aqlprofile_api.hsa_ven_amd_aqlprofile_error_string_fn != NULL) {
|
|
setFlag(HSA_EXTENSION_AMD_AQLPROFILE);
|
|
}
|
|
|
|
setFlag(HSA_EXTENSION_AMD_PROFILER);
|
|
|
|
break;
|
|
}
|
|
default:
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
uint32_t Runtime::GetQueueId() { return atomic::Increment(&queue_count_); }
|
|
|
|
hsa_status_t Runtime::SetAsyncSignalHandler(hsa_signal_t signal,
|
|
hsa_signal_condition_t cond,
|
|
hsa_signal_value_t value,
|
|
hsa_amd_signal_handler handler,
|
|
void* arg) {
|
|
// Indicate that this signal is in use.
|
|
if (signal.handle != 0) hsa_signal_handle(signal)->Retain();
|
|
|
|
ScopedAcquire<KernelMutex> scope_lock(&async_events_control_.lock);
|
|
|
|
// Lazy initializer
|
|
if (async_events_control_.async_events_thread_ == NULL) {
|
|
// Create monitoring thread control signal
|
|
auto err = HSA::hsa_signal_create(0, 0, NULL, &async_events_control_.wake);
|
|
if (err != HSA_STATUS_SUCCESS) {
|
|
assert(false && "Asyncronous events control signal creation error.");
|
|
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
async_events_.PushBack(async_events_control_.wake, HSA_SIGNAL_CONDITION_NE,
|
|
0, NULL, NULL);
|
|
|
|
// Start event monitoring thread
|
|
async_events_control_.exit = false;
|
|
async_events_control_.async_events_thread_ =
|
|
os::CreateThread(AsyncEventsLoop, NULL);
|
|
if (async_events_control_.async_events_thread_ == NULL) {
|
|
assert(false && "Asyncronous events thread creation error.");
|
|
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
}
|
|
|
|
new_async_events_.PushBack(signal, cond, value, handler, arg);
|
|
|
|
hsa_signal_handle(async_events_control_.wake)->StoreRelease(1);
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents,
|
|
int interop_handle, uint32_t flags,
|
|
size_t* size, void** ptr,
|
|
size_t* metadata_size, const void** metadata) {
|
|
static const int tinyArraySize=8;
|
|
HsaGraphicsResourceInfo info;
|
|
|
|
HSAuint32 short_nodes[tinyArraySize];
|
|
HSAuint32* nodes = short_nodes;
|
|
if (num_agents > tinyArraySize) {
|
|
nodes = new HSAuint32[num_agents];
|
|
if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
MAKE_SCOPE_GUARD([&]() {
|
|
if (num_agents > tinyArraySize) delete[] nodes;
|
|
});
|
|
|
|
for (int i = 0; i < num_agents; i++)
|
|
agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID,
|
|
&nodes[i]);
|
|
|
|
if (hsaKmtRegisterGraphicsHandleToNodes(interop_handle, &info, num_agents,
|
|
nodes) != HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR;
|
|
|
|
HSAuint64 altAddress;
|
|
HsaMemMapFlags map_flags;
|
|
map_flags.Value = 0;
|
|
map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB;
|
|
if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes,
|
|
&altAddress, map_flags, num_agents,
|
|
nodes) != HSAKMT_STATUS_SUCCESS) {
|
|
map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
|
|
if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes, &altAddress, map_flags,
|
|
num_agents, nodes) != HSAKMT_STATUS_SUCCESS) {
|
|
hsaKmtDeregisterMemory(info.MemoryAddress);
|
|
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
}
|
|
|
|
if (metadata_size != NULL) *metadata_size = info.MetadataSizeInBytes;
|
|
if (metadata != NULL) *metadata = info.Metadata;
|
|
|
|
*size = info.SizeInBytes;
|
|
*ptr = info.MemoryAddress;
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::InteropUnmap(void* ptr) {
|
|
if(hsaKmtUnmapMemoryToGPU(ptr)!=HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
if(hsaKmtDeregisterMemory(ptr)!=HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
|
|
uint32_t* num_agents_accessible, hsa_agent_t** accessible,
|
|
PtrInfoBlockData* block_info) {
|
|
static_assert(static_cast<int>(HSA_POINTER_UNKNOWN) == static_cast<int>(HSA_EXT_POINTER_TYPE_UNKNOWN),
|
|
"Thunk pointer info mismatch");
|
|
static_assert(static_cast<int>(HSA_POINTER_ALLOCATED) == static_cast<int>(HSA_EXT_POINTER_TYPE_HSA),
|
|
"Thunk pointer info mismatch");
|
|
static_assert(static_cast<int>(HSA_POINTER_REGISTERED_USER) == static_cast<int>(HSA_EXT_POINTER_TYPE_LOCKED),
|
|
"Thunk pointer info mismatch");
|
|
static_assert(static_cast<int>(HSA_POINTER_REGISTERED_GRAPHICS) == static_cast<int>(HSA_EXT_POINTER_TYPE_GRAPHICS),
|
|
"Thunk pointer info mismatch");
|
|
|
|
HsaPointerInfo thunkInfo;
|
|
uint32_t* mappedNodes;
|
|
|
|
hsa_amd_pointer_info_t retInfo;
|
|
|
|
// check output struct has an initialized size.
|
|
if (info->size == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
|
|
bool returnListData =
|
|
((alloc != nullptr) && (num_agents_accessible != nullptr) && (accessible != nullptr));
|
|
|
|
{ // memory_lock protects access to the NMappedNodes array and fragment user data since these may
|
|
// change with calls to memory APIs.
|
|
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
|
hsaKmtQueryPointerInfo(ptr, &thunkInfo);
|
|
if (returnListData) {
|
|
assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
|
|
"PointerInfo: Thunk returned more than all agents in NMappedNodes.");
|
|
mappedNodes = (uint32_t*)alloca(thunkInfo.NMappedNodes * sizeof(uint32_t));
|
|
memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
|
|
}
|
|
retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
|
|
retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
|
|
retInfo.hostBaseAddress = thunkInfo.CPUAddress;
|
|
retInfo.sizeInBytes = thunkInfo.SizeInBytes;
|
|
retInfo.userData = thunkInfo.UserData;
|
|
if (block_info != nullptr) {
|
|
// The only time host and agent ptr may be different is when the memory is lock memory (malloc
|
|
// memory pinned for GPU access). In this case there can not be any suballocation so
|
|
// block_info is redundant and unused. Host address is returned since host address is used to
|
|
// manipulate lock memory. This protects future use of block_info with lock memory.
|
|
block_info->base = retInfo.hostBaseAddress;
|
|
block_info->length = retInfo.sizeInBytes;
|
|
}
|
|
if (retInfo.type == HSA_EXT_POINTER_TYPE_HSA) {
|
|
auto fragment = allocation_map_.upper_bound(ptr);
|
|
if (fragment != allocation_map_.begin()) {
|
|
fragment--;
|
|
if ((fragment->first <= ptr) &&
|
|
(ptr < reinterpret_cast<const uint8_t*>(fragment->first) + fragment->second.size)) {
|
|
// agent and host address must match here. Only lock memory is allowed to have differing
|
|
// addresses but lock memory has type HSA_EXT_POINTER_TYPE_LOCKED and cannot be
|
|
// suballocated.
|
|
retInfo.agentBaseAddress = const_cast<void*>(fragment->first);
|
|
retInfo.hostBaseAddress = retInfo.agentBaseAddress;
|
|
retInfo.sizeInBytes = fragment->second.size;
|
|
retInfo.userData = fragment->second.user_ptr;
|
|
}
|
|
}
|
|
}
|
|
} // end lock scope
|
|
|
|
retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));
|
|
|
|
// Temp: workaround thunk bug, IPC memory has garbage in Node.
|
|
// retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
|
|
auto it = agents_by_node_.find(thunkInfo.Node);
|
|
if (it != agents_by_node_.end())
|
|
retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
|
|
else
|
|
retInfo.agentOwner.handle = 0;
|
|
|
|
memcpy(info, &retInfo, retInfo.size);
|
|
|
|
if (returnListData) {
|
|
uint32_t count = 0;
|
|
for (int i = 0; i < thunkInfo.NMappedNodes; i++) {
|
|
assert(mappedNodes[i] < agents_by_node_.size() &&
|
|
"PointerInfo: Invalid node ID returned from thunk.");
|
|
count += agents_by_node_[mappedNodes[i]].size();
|
|
}
|
|
|
|
*accessible = (hsa_agent_t*)alloc(sizeof(hsa_agent_t) * count);
|
|
if ((*accessible) == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
*num_agents_accessible = count;
|
|
|
|
uint32_t index = 0;
|
|
for (int i = 0; i < thunkInfo.NMappedNodes; i++) {
|
|
auto& list = agents_by_node_[mappedNodes[i]];
|
|
for (int j = 0; j < list.size(); j++) {
|
|
(*accessible)[index] = list[j]->public_handle();
|
|
index++;
|
|
}
|
|
}
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) {
|
|
{ // Use allocation map if possible to handle fragments.
|
|
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
|
const auto& it = allocation_map_.find(ptr);
|
|
if (it != allocation_map_.end()) {
|
|
it->second.user_ptr = userptr;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
}
|
|
// Cover entries not in the allocation map (graphics, lock,...)
|
|
if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_SUCCESS;
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
|
|
hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) {
|
|
static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle),
|
|
"Thunk IPC mismatch.");
|
|
// Reject sharing allocations larger than ~8TB due to thunk limitations.
|
|
if (len > 0x7FFFFFFF000ull) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
|
|
// Check for fragment sharing.
|
|
PtrInfoBlockData block;
|
|
hsa_amd_pointer_info_t info;
|
|
info.size = sizeof(info);
|
|
if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
if ((block.base != ptr) || (block.length != len)) {
|
|
if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) {
|
|
assert(false && "Fragment's block not aligned to 2MB!");
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
if (hsaKmtShareMemory(block.base, block.length, reinterpret_cast<HsaSharedMemoryHandle*>(
|
|
handle)) != HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
uint32_t offset =
|
|
(reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<uint8_t*>(block.base)) / 4096;
|
|
// Holds size in (4K?) pages in thunk handle: Mark as a fragment and denote offset.
|
|
handle->handle[6] |= 0x80000000 | offset;
|
|
} else {
|
|
if (hsaKmtShareMemory(ptr, len, reinterpret_cast<HsaSharedMemoryHandle*>(handle)) !=
|
|
HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents,
|
|
Agent** agents, void** mapped_ptr) {
|
|
static const int tinyArraySize = 8;
|
|
void* importAddress;
|
|
HSAuint64 importSize;
|
|
HSAuint64 altAddress;
|
|
|
|
hsa_amd_ipc_memory_t importHandle;
|
|
importHandle = *handle;
|
|
|
|
// Extract fragment info
|
|
bool isFragment = false;
|
|
uint32_t fragOffset = 0;
|
|
auto fixFragment = [&]() {
|
|
if (!isFragment) return;
|
|
importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
|
|
len = Min(len, importSize - fragOffset);
|
|
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
|
allocation_map_[importAddress] = AllocationRegion(nullptr, len);
|
|
};
|
|
|
|
if ((importHandle.handle[6] & 0x80000000) != 0) {
|
|
isFragment = true;
|
|
fragOffset = (importHandle.handle[6] & 0x1FF) * 4096;
|
|
importHandle.handle[6] &= ~(0x80000000 | 0x1FF);
|
|
}
|
|
|
|
if (num_agents == 0) {
|
|
if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle),
|
|
&importAddress, &importSize) != HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) != HSAKMT_STATUS_SUCCESS) {
|
|
hsaKmtDeregisterMemory(importAddress);
|
|
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
fixFragment();
|
|
*mapped_ptr = importAddress;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
HSAuint32* nodes = nullptr;
|
|
if (num_agents > tinyArraySize)
|
|
nodes = new HSAuint32[num_agents];
|
|
else
|
|
nodes = (HSAuint32*)alloca(sizeof(HSAuint32) * num_agents);
|
|
if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
|
|
MAKE_SCOPE_GUARD([&]() {
|
|
if (num_agents > tinyArraySize) delete[] nodes;
|
|
});
|
|
|
|
for (int i = 0; i < num_agents; i++)
|
|
agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &nodes[i]);
|
|
|
|
if (hsaKmtRegisterSharedHandleToNodes(
|
|
reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle), &importAddress,
|
|
&importSize, num_agents, nodes) != HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
|
|
HsaMemMapFlags map_flags;
|
|
map_flags.Value = 0;
|
|
map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB;
|
|
if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress, map_flags, num_agents,
|
|
nodes) != HSAKMT_STATUS_SUCCESS) {
|
|
map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
|
|
if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress, map_flags, num_agents,
|
|
nodes) != HSAKMT_STATUS_SUCCESS) {
|
|
hsaKmtDeregisterMemory(importAddress);
|
|
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
}
|
|
|
|
fixFragment();
|
|
*mapped_ptr = importAddress;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t Runtime::IPCDetach(void* ptr) {
|
|
{ // Handle imported fragments.
|
|
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
|
const auto& it = allocation_map_.find(ptr);
|
|
if (it != allocation_map_.end()) {
|
|
if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
allocation_map_.erase(it);
|
|
lock.Release(); // Can't hold memory lock when using pointer info.
|
|
|
|
PtrInfoBlockData block;
|
|
hsa_amd_pointer_info_t info;
|
|
info.size = sizeof(info);
|
|
if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
ptr = block.base;
|
|
}
|
|
}
|
|
if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
void Runtime::AsyncEventsLoop(void*) {
|
|
auto& async_events_control_ = runtime_singleton_->async_events_control_;
|
|
auto& async_events_ = runtime_singleton_->async_events_;
|
|
auto& new_async_events_ = runtime_singleton_->new_async_events_;
|
|
|
|
while (!async_events_control_.exit) {
|
|
// Wait for a signal
|
|
hsa_signal_value_t value;
|
|
uint32_t index = AMD::hsa_amd_signal_wait_any(
|
|
uint32_t(async_events_.Size()), &async_events_.signal_[0],
|
|
&async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1),
|
|
HSA_WAIT_STATE_BLOCKED, &value);
|
|
|
|
// Reset the control signal
|
|
if (index == 0) {
|
|
hsa_signal_handle(async_events_control_.wake)->StoreRelaxed(0);
|
|
} else if (index != -1) {
|
|
// No error or timout occured, process the handler
|
|
assert(async_events_.handler_[index] != NULL);
|
|
bool keep =
|
|
async_events_.handler_[index](value, async_events_.arg_[index]);
|
|
if (!keep) {
|
|
hsa_signal_handle(async_events_.signal_[index])->Release();
|
|
async_events_.CopyIndex(index, async_events_.Size() - 1);
|
|
async_events_.PopBack();
|
|
}
|
|
}
|
|
|
|
// Check for dead signals
|
|
index = 0;
|
|
while (index != async_events_.Size()) {
|
|
if (!hsa_signal_handle(async_events_.signal_[index])->IsValid()) {
|
|
hsa_signal_handle(async_events_.signal_[index])->Release();
|
|
async_events_.CopyIndex(index, async_events_.Size() - 1);
|
|
async_events_.PopBack();
|
|
continue;
|
|
}
|
|
index++;
|
|
}
|
|
|
|
// Insert new signals and find plain functions
|
|
typedef std::pair<void (*)(void*), void*> func_arg_t;
|
|
std::vector<func_arg_t> functions;
|
|
{
|
|
ScopedAcquire<KernelMutex> scope_lock(&async_events_control_.lock);
|
|
for (size_t i = 0; i < new_async_events_.Size(); i++) {
|
|
if (new_async_events_.signal_[i].handle == 0) {
|
|
functions.push_back(
|
|
func_arg_t((void (*)(void*))new_async_events_.handler_[i],
|
|
new_async_events_.arg_[i]));
|
|
continue;
|
|
}
|
|
async_events_.PushBack(
|
|
new_async_events_.signal_[i], new_async_events_.cond_[i],
|
|
new_async_events_.value_[i], new_async_events_.handler_[i],
|
|
new_async_events_.arg_[i]);
|
|
}
|
|
new_async_events_.Clear();
|
|
}
|
|
|
|
// Call plain functions
|
|
for (size_t i = 0; i < functions.size(); i++)
|
|
functions[i].first(functions[i].second);
|
|
functions.clear();
|
|
}
|
|
|
|
// Release wait count of all pending signals
|
|
for (size_t i = 1; i < async_events_.Size(); i++)
|
|
hsa_signal_handle(async_events_.signal_[i])->Release();
|
|
async_events_.Clear();
|
|
|
|
for (size_t i = 0; i < new_async_events_.Size(); i++)
|
|
hsa_signal_handle(new_async_events_.signal_[i])->Release();
|
|
new_async_events_.Clear();
|
|
}
|
|
|
|
void Runtime::BindVmFaultHandler() {
|
|
if (core::g_use_interrupt_wait) {
|
|
// Create memory event with manual reset to avoid racing condition
|
|
// with driver in case of multiple concurrent VM faults.
|
|
vm_fault_event_ =
|
|
core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true);
|
|
|
|
// Create an interrupt signal object to contain the memory event.
|
|
// This signal object will be registered with the async handler global
|
|
// thread.
|
|
vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_);
|
|
|
|
if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) {
|
|
assert(false && "Failed on creating VM fault signal");
|
|
return;
|
|
}
|
|
|
|
SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_),
|
|
HSA_SIGNAL_CONDITION_NE, 0, VMFaultHandler,
|
|
reinterpret_cast<void*>(vm_fault_signal_));
|
|
}
|
|
}
|
|
|
|
bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
|
core::InterruptSignal* vm_fault_signal =
|
|
reinterpret_cast<core::InterruptSignal*>(arg);
|
|
|
|
assert(vm_fault_signal != NULL);
|
|
|
|
if (vm_fault_signal == NULL) {
|
|
return false;
|
|
}
|
|
|
|
if (runtime_singleton_->flag().enable_vm_fault_message()) {
|
|
HsaEvent* vm_fault_event = vm_fault_signal->EopEvent();
|
|
|
|
const HsaMemoryAccessFault& fault =
|
|
vm_fault_event->EventData.EventData.MemoryAccessFault;
|
|
|
|
std::string reason = "";
|
|
if (fault.Failure.NotPresent == 1) {
|
|
reason += "Page not present or supervisor privilege";
|
|
} else if (fault.Failure.ReadOnly == 1) {
|
|
reason += "Write access to a read-only page";
|
|
} else if (fault.Failure.NoExecute == 1) {
|
|
reason += "Execute access to a page marked NX";
|
|
} else if (fault.Failure.GpuAccess == 1) {
|
|
reason += "Host access only";
|
|
} else if (fault.Failure.ECC == 1) {
|
|
reason += "ECC failure (if supported by HW)";
|
|
} else {
|
|
reason += "Unknown";
|
|
}
|
|
|
|
fprintf(stderr,
|
|
"Memory access fault by GPU node-%u on address %p%s. Reason: %s.\n",
|
|
fault.NodeId, reinterpret_cast<const void*>(fault.VirtualAddress),
|
|
(fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "",
|
|
reason.c_str());
|
|
} else {
|
|
assert(false && "GPU memory access fault.");
|
|
}
|
|
|
|
std::abort();
|
|
|
|
// No need to keep the signal because we are done.
|
|
return false;
|
|
}
|
|
|
|
Runtime::Runtime()
|
|
: blit_agent_(NULL),
|
|
queue_count_(0),
|
|
sys_clock_freq_(0),
|
|
vm_fault_event_(nullptr),
|
|
vm_fault_signal_(nullptr),
|
|
ref_count_(0) {
|
|
start_svm_address_ = 0;
|
|
#if defined(HSA_LARGE_MODEL)
|
|
end_svm_address_ = UINT64_MAX;
|
|
#else
|
|
end_svm_address_ = UINT32_MAX;
|
|
#endif
|
|
}
|
|
|
|
hsa_status_t Runtime::Load() {
|
|
flag_.Refresh();
|
|
|
|
g_use_interrupt_wait = flag_.enable_interrupt();
|
|
|
|
if (!amd::Load()) {
|
|
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
|
}
|
|
|
|
loader_ = amd::hsa::loader::Loader::Create(&loader_context_);
|
|
|
|
// Load extensions
|
|
LoadExtensions();
|
|
|
|
// Load tools libraries
|
|
LoadTools();
|
|
|
|
for (core::Agent* agent : gpu_agents_) {
|
|
hsa_status_t status =
|
|
reinterpret_cast<amd::GpuAgentInt*>(agent)->PostToolsInit();
|
|
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
return status;
|
|
}
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
void Runtime::Unload() {
|
|
UnloadTools();
|
|
UnloadExtensions();
|
|
|
|
amd::hsa::loader::Loader::Destroy(loader_);
|
|
loader_ = nullptr;
|
|
|
|
async_events_control_.Shutdown();
|
|
|
|
if (vm_fault_signal_ != nullptr) {
|
|
vm_fault_signal_->DestroySignal();
|
|
vm_fault_signal_ = nullptr;
|
|
}
|
|
core::InterruptSignal::DestroyEvent(vm_fault_event_);
|
|
vm_fault_event_ = nullptr;
|
|
|
|
DestroyAgents();
|
|
|
|
CloseTools();
|
|
|
|
amd::Unload();
|
|
}
|
|
|
|
void Runtime::LoadExtensions() {
|
|
// Load finalizer and extension library
|
|
#ifdef HSA_LARGE_MODEL
|
|
static const std::string kFinalizerLib[] = {"hsa-ext-finalize64.dll",
|
|
"libhsa-ext-finalize64.so.1"};
|
|
static const std::string kImageLib[] = {"hsa-ext-image64.dll",
|
|
"libhsa-ext-image64.so.1"};
|
|
static const std::string kAqlProfileLib[] = {"hsa-amd-aqlprofile64.dll",
|
|
"libhsa-amd-aqlprofile64.so.1"};
|
|
#else
|
|
static const std::string kFinalizerLib[] = {"hsa-ext-finalize.dll",
|
|
"libhsa-ext-finalize.so.1"};
|
|
static const std::string kImageLib[] = {"hsa-ext-image.dll",
|
|
"libhsa-ext-image.so.1"};
|
|
static const std::string kAqlProfileLib[] = {"hsa-amd-aqlprofile.dll",
|
|
"libhsa-amd-aqlprofile.so.1"};
|
|
#endif
|
|
|
|
// Update Hsa Api Table with handle of Image extension Apis
|
|
extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]);
|
|
hsa_api_table_.LinkExts(&extensions_.finalizer_api,
|
|
core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID);
|
|
|
|
// Update Hsa Api Table with handle of Finalizer extension Apis
|
|
extensions_.LoadImage(kImageLib[os_index(os::current_os)]);
|
|
hsa_api_table_.LinkExts(&extensions_.image_api,
|
|
core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID);
|
|
|
|
// Update Hsa Api Table with handle of AqlProfile extension Apis
|
|
extensions_.LoadAqlProfileApi(kAqlProfileLib[os_index(os::current_os)]);
|
|
hsa_api_table_.LinkExts(&extensions_.aqlprofile_api,
|
|
core::HsaApiTable::HSA_EXT_AQLPROFILE_API_TABLE_ID);
|
|
}
|
|
|
|
void Runtime::UnloadExtensions() { extensions_.Unload(); }
|
|
|
|
static std::vector<std::string> parse_tool_names(std::string tool_names) {
|
|
std::vector<std::string> names;
|
|
std::string name = "";
|
|
bool quoted = false;
|
|
while (tool_names.size() != 0) {
|
|
auto index = tool_names.find_first_of(" \"\\");
|
|
if (index == std::string::npos) {
|
|
name += tool_names;
|
|
break;
|
|
}
|
|
switch (tool_names[index]) {
|
|
case ' ': {
|
|
if (!quoted) {
|
|
name += tool_names.substr(0, index);
|
|
tool_names.erase(0, index + 1);
|
|
names.push_back(name);
|
|
name = "";
|
|
} else {
|
|
name += tool_names.substr(0, index + 1);
|
|
tool_names.erase(0, index + 1);
|
|
}
|
|
break;
|
|
}
|
|
case '\"': {
|
|
if (quoted) {
|
|
quoted = false;
|
|
name += tool_names.substr(0, index);
|
|
tool_names.erase(0, index + 1);
|
|
names.push_back(name);
|
|
name = "";
|
|
} else {
|
|
quoted = true;
|
|
tool_names.erase(0, index + 1);
|
|
}
|
|
break;
|
|
}
|
|
case '\\': {
|
|
if (tool_names.size() > index + 1) {
|
|
name += tool_names.substr(0, index) + tool_names[index + 1];
|
|
tool_names.erase(0, index + 2);
|
|
}
|
|
break;
|
|
}
|
|
} // end switch
|
|
} // end while
|
|
|
|
if (name != "") names.push_back(name);
|
|
return names;
|
|
}
|
|
|
|
void Runtime::LoadTools() {
|
|
typedef bool (*tool_init_t)(::HsaApiTable*, uint64_t, uint64_t,
|
|
const char* const*);
|
|
typedef Agent* (*tool_wrap_t)(Agent*);
|
|
typedef void (*tool_add_t)(Runtime*);
|
|
|
|
// Load tool libs
|
|
std::string tool_names = flag_.tools_lib_names();
|
|
if (tool_names != "") {
|
|
std::vector<std::string> names = parse_tool_names(tool_names);
|
|
std::vector<const char*> failed;
|
|
for (int i = 0; i < names.size(); i++) {
|
|
os::LibHandle tool = os::LoadLib(names[i]);
|
|
|
|
if (tool != NULL) {
|
|
tool_libs_.push_back(tool);
|
|
|
|
tool_init_t ld;
|
|
ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad");
|
|
if (ld) {
|
|
if (!ld(&hsa_api_table_.hsa_api,
|
|
hsa_api_table_.hsa_api.version.major_id,
|
|
failed.size(), &failed[0])) {
|
|
failed.push_back(names[i].c_str());
|
|
os::CloseLib(tool);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
tool_wrap_t wrap;
|
|
wrap = (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent");
|
|
if (wrap) {
|
|
std::vector<core::Agent*>* agent_lists[2] = {&cpu_agents_,
|
|
&gpu_agents_};
|
|
for (std::vector<core::Agent*>* agent_list : agent_lists) {
|
|
for (size_t agent_idx = 0; agent_idx < agent_list->size();
|
|
++agent_idx) {
|
|
Agent* agent = wrap(agent_list->at(agent_idx));
|
|
if (agent != NULL) {
|
|
assert(agent->IsValid() &&
|
|
"Agent returned from WrapAgent is not valid");
|
|
agent_list->at(agent_idx) = agent;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
tool_add_t add;
|
|
add = (tool_add_t)os::GetExportAddress(tool, "AddAgent");
|
|
if (add) add(this);
|
|
}
|
|
#ifndef NDEBUG
|
|
else {
|
|
if (flag().report_tool_load_failures())
|
|
fprintf(stderr, "Tool lib \"%s\" failed to load.\n", names[i].c_str());
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
|
|
void Runtime::UnloadTools() {
|
|
typedef void (*tool_unload_t)();
|
|
for (size_t i = tool_libs_.size(); i != 0; i--) {
|
|
tool_unload_t unld;
|
|
unld = (tool_unload_t)os::GetExportAddress(tool_libs_[i - 1], "OnUnload");
|
|
if (unld) unld();
|
|
}
|
|
|
|
// Reset API table in case some tool doesn't cleanup properly
|
|
hsa_api_table_.Reset();
|
|
}
|
|
|
|
void Runtime::CloseTools() {
|
|
// Due to valgrind bug, runtime cannot dlclose extensions see:
|
|
// http://valgrind.org/docs/manual/faq.html#faq.unhelpful
|
|
if (!flag_.running_valgrind()) {
|
|
for (int i = 0; i < tool_libs_.size(); i++) os::CloseLib(tool_libs_[i]);
|
|
}
|
|
tool_libs_.clear();
|
|
}
|
|
|
|
void Runtime::AsyncEventsControl::Shutdown() {
|
|
if (async_events_thread_ != NULL) {
|
|
exit = true;
|
|
hsa_signal_handle(wake)->StoreRelaxed(1);
|
|
os::WaitForThread(async_events_thread_);
|
|
os::CloseThread(async_events_thread_);
|
|
async_events_thread_ = NULL;
|
|
HSA::hsa_signal_destroy(wake);
|
|
}
|
|
}
|
|
|
|
void Runtime::AsyncEvents::PushBack(hsa_signal_t signal,
|
|
hsa_signal_condition_t cond,
|
|
hsa_signal_value_t value,
|
|
hsa_amd_signal_handler handler, void* arg) {
|
|
signal_.push_back(signal);
|
|
cond_.push_back(cond);
|
|
value_.push_back(value);
|
|
handler_.push_back(handler);
|
|
arg_.push_back(arg);
|
|
}
|
|
|
|
void Runtime::AsyncEvents::CopyIndex(size_t dst, size_t src) {
|
|
signal_[dst] = signal_[src];
|
|
cond_[dst] = cond_[src];
|
|
value_[dst] = value_[src];
|
|
handler_[dst] = handler_[src];
|
|
arg_[dst] = arg_[src];
|
|
}
|
|
|
|
size_t Runtime::AsyncEvents::Size() { return signal_.size(); }
|
|
|
|
void Runtime::AsyncEvents::PopBack() {
|
|
signal_.pop_back();
|
|
cond_.pop_back();
|
|
value_.pop_back();
|
|
handler_.pop_back();
|
|
arg_.pop_back();
|
|
}
|
|
|
|
void Runtime::AsyncEvents::Clear() {
|
|
signal_.clear();
|
|
cond_.clear();
|
|
value_.clear();
|
|
handler_.clear();
|
|
arg_.clear();
|
|
}
|
|
|
|
} // namespace core
|