SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers (#2146)

* SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers

* Remove KernelMutex and KernelSharedMutex abstractions with std::mutex and std::shared_mutex

* Replaced unique_locks with lock_guards

* More changes

* Replace new and deletes with smart pointers

* Replaced some more with shared ptrs

* Replacements with smart pointers - pt 2

* missed change
This commit is contained in:
pghoshamd
2026-01-06 10:59:34 -05:00
zatwierdzone przez GitHub
rodzic e005f8487b
commit 637b0d71f0
34 zmienionych plików z 319 dodań i 452 usunięć
@@ -47,6 +47,7 @@
#include <assert.h>
#include <vector>
#include <mutex>
#include "core/inc/checked.h"
#include "core/inc/isa.h"
@@ -291,7 +292,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
void* value) const = 0;
// @brief Returns an array of regions owned by the agent.
virtual const std::vector<const core::MemoryRegion*>& regions() const = 0;
virtual const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const = 0;
// @brief Returns the ISAs supported by the agent.
// @details The returned vector is a list of pointers to the supported ISA,
@@ -336,7 +337,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
__forceinline void Disable() { enabled_ = false; }
virtual void Trim() {
for (auto region : regions()) region->Trim();
for (const auto& region : regions()) region.get()->Trim();
}
virtual void ReleaseResources() { }
@@ -385,7 +386,7 @@ protected:
// Serial memory operations are needed to ensure, among other things, that allocation failures are
// due to true OOM conditions and per region caching (Trim and Allocate must be serial and
// exclusive to ensure this).
KernelMutex agent_memory_lock_;
std::mutex agent_memory_lock_;
// Forbid copying and moving of this object
DISALLOW_COPY_AND_ASSIGN(Agent);
@@ -82,7 +82,7 @@ public:
/// @brief Override from core::Agent.
const std::vector<const core::Isa*>& supported_isas() const override { return supported_isas_; }
const std::vector<const core::MemoryRegion*>& regions() const override { return regions_; }
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override { return regions_; }
/// @brief Getter for the AIE system allocator.
const std::function<void*(size_t size, size_t align, core::MemoryRegion::AllocateFlags flags)>&
@@ -101,7 +101,7 @@ private:
/// @brief Setup the memory allocators used by this agent.
void InitAllocators();
std::vector<const core::MemoryRegion *> regions_;
std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
std::function<void *(size_t size, size_t align,
core::MemoryRegion::AllocateFlags flags)>
system_allocator_;
@@ -306,7 +306,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
// GPU-visible indirect buffer holding PM4 commands.
void* pm4_ib_buf_;
uint32_t pm4_ib_size_b_;
KernelMutex pm4_ib_mutex_;
std::mutex pm4_ib_mutex_;
// Error handler control variable.
std::atomic<uint32_t> dynamicScratchState, exceptionState;
@@ -322,11 +322,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
Signal* exception_signal_;
// CU mask lock
KernelMutex mask_lock_;
std::mutex mask_lock_;
// Mutex to prevent AsyncReclaimScratch and HandleInsufficientScratch from
// happening at the same time.
KernelMutex scratch_lock_;
std::mutex scratch_lock_;
// Current CU mask
std::vector<uint32_t> cu_mask_;
@@ -345,10 +345,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
}
// Mutex for queue_event_ manipulation
KernelMutex& queue_lock() {
std::mutex& queue_lock() {
// This allocation is meant to last until the last thread has exited.
// It is intentionally not freed.
static KernelMutex* queue_lock_ = new KernelMutex();
static std::mutex* queue_lock_ = new std::mutex();
return *queue_lock_;
}
@@ -255,7 +255,7 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
// Internal signals for blocking APIs
core::unique_signal_ptr signals_[2];
KernelMutex lock_;
std::mutex lock_;
bool parity_;
/// Queue resource descriptor for doorbell, read
@@ -127,7 +127,7 @@ class CpuAgent : public core::Agent {
}
// @brief Override from core::Agent.
const std::vector<const core::MemoryRegion*>& regions() const override {
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
return regions_;
}
@@ -151,7 +151,7 @@ class CpuAgent : public core::Agent {
// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
// region returns ::HSA_STATUS_SUCCESS.
hsa_status_t VisitRegion(
const std::vector<const core::MemoryRegion*>& regions,
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const;
@@ -166,7 +166,7 @@ class CpuAgent : public core::Agent {
std::vector<std::unique_ptr<core::Cache>> caches_;
// @brief Array of regions owned by this agent.
std::vector<const core::MemoryRegion*> regions_;
std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
DISALLOW_COPY_AND_ASSIGN(CpuAgent);
};
@@ -394,7 +394,7 @@ class GpuAgent : public GpuAgentInt {
}
// @brief Override from core::Agent.
const std::vector<const core::MemoryRegion*>& regions() const override {
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
return regions_;
}
@@ -536,7 +536,7 @@ class GpuAgent : public GpuAgentInt {
// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
// region returns ::HSA_STATUS_SUCCESS.
hsa_status_t VisitRegion(
const std::vector<const core::MemoryRegion*>& regions,
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const;
@@ -594,7 +594,7 @@ class GpuAgent : public GpuAgentInt {
std::vector<const core::Agent*> xgmi_peer_list_;
// Protects xgmi_peer_list_
KernelMutex xgmi_peer_list_lock_;
std::mutex xgmi_peer_list_lock_;
// @brief AQL queues for cache management and blit compute usage.
enum QueueEnum {
@@ -607,19 +607,19 @@ class GpuAgent : public GpuAgentInt {
lazy_ptr<core::Queue> queues_[QueueCount];
// @brief Mutex to protect the update to coherency type.
KernelMutex coherency_lock_;
std::mutex coherency_lock_;
// @brief Mutex to protect access to scratch pool.
KernelMutex scratch_lock_;
std::mutex scratch_lock_;
// @brief Mutex to protect access to ::t1_.
KernelMutex t1_lock_;
std::mutex t1_lock_;
// @brief Mutex to protect access to blit objects.
KernelMutex blit_lock_;
std::mutex blit_lock_;
// @brief Mutex to protect sdma gang submissions.
KernelMutex sdma_gang_lock_;
std::mutex sdma_gang_lock_;
// @brief GPU tick on initialization.
HsaClockCounters t0_;
@@ -638,7 +638,7 @@ class GpuAgent : public GpuAgentInt {
std::vector<std::unique_ptr<core::Cache>> caches_;
// @brief Array of regions owned by this agent.
std::vector<const core::MemoryRegion*> regions_;
std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
core::Isa* isa_;
@@ -729,7 +729,7 @@ class GpuAgent : public GpuAgentInt {
struct {
lazy_ptr<core::Queue> queue_;
int ref_ct_;
KernelMutex lock_;
std::mutex lock_;
} gws_queue_;
// @brief list of AQL queues owned by this agent. Indexed by queue pointer
@@ -763,7 +763,7 @@ class GpuAgent : public GpuAgentInt {
/// @brief Coarse-grain deallocator on this GPU.
std::function<void(void*)> coarsegrain_deallocator_;
void* trap_handler_tma_region_;
std::unique_ptr<void, std::function<void(void*)>> trap_handler_tma_region_;
/* PC Sampling fields - begin */
/* 2nd level Trap handler code is based on the offsets within this structure */
@@ -181,7 +181,7 @@ namespace code {
std::vector<Segment*> dataSegments;
std::vector<Section*> dataSections;
std::vector<RelocationSection*> relocationSections;
std::vector<Symbol*> symbols;
std::vector<std::shared_ptr<Symbol>> symbols;
bool combineDataSegments;
Segment* hsaSegments[AMDGPU_HSA_SEGMENT_LAST][2];
Section* hsaSections[AMDGPU_HSA_SECTION_LAST];
@@ -234,7 +234,7 @@ namespace code {
uint32_t OsAbi() const { return img->OsAbi(); }
AmdHsaCode(bool combineDataSegments = true);
virtual ~AmdHsaCode();
virtual ~AmdHsaCode() = default;
std::string output() { return out.str(); }
bool LoadFromFile(const std::string& filename);
@@ -347,7 +347,7 @@ namespace code {
RelocationSection* GetRelocationSection(size_t i) { return relocationSections[i]; }
size_t SymbolCount() { return symbols.size(); }
Symbol* GetSymbol(size_t i) { return symbols[i]; }
Symbol* GetSymbol(size_t i) { return symbols[i].get(); }
Symbol* GetSymbolByElfIndex(size_t index);
Symbol* FindSymbol(const std::string &n);
@@ -362,11 +362,11 @@ namespace code {
class AmdHsaCodeManager {
private:
typedef std::unordered_map<uint64_t, AmdHsaCode*> CodeMap;
typedef std::unordered_map<uint64_t, std::shared_ptr<AmdHsaCode>> CodeMap;
CodeMap codeMap;
public:
AmdHsaCode* FromHandle(hsa_code_object_t handle);
const std::shared_ptr<AmdHsaCode>& FromHandle(hsa_code_object_t handle);
bool Destroy(hsa_code_object_t handle);
};
@@ -422,7 +422,7 @@ private:
Executable(const Executable &e);
Executable& operator=(const Executable &e);
static std::vector<Executable*> executables;
static std::vector<std::shared_ptr<Executable>> executables;
static std::mutex executables_mutex;
};
@@ -187,7 +187,7 @@ private:
// Protects against concurrent allow_access calls to fragments of the same block by virtue of all
// fragments of the block routing to the same MemoryRegion.
mutable KernelMutex access_lock_;
mutable std::mutex access_lock_;
static const size_t kPageSize_;
@@ -216,7 +216,7 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi
private:
// Serialize packet interception processing.
KernelMutex lock_;
std::mutex lock_;
// Largest processed packet index.
uint64_t next_packet_;
@@ -103,7 +103,7 @@ class IPCSignal : private SharedMemorySignal, public BusyWaitSignal {
static int rtti_id_ = 0;
return rtti_id_;
}
static KernelMutex lock_;
static std::mutex lock_;
explicit IPCSignal(SharedMemorySignal&& abi_block)
: SharedMemorySignal(std::move(abi_block)), BusyWaitSignal(signal(), true) {}
@@ -51,6 +51,7 @@
#include <tuple>
#include <utility>
#include <thread>
#include <shared_mutex>
#if defined(__linux__)
#include <sys/un.h>
#include <xf86drm.h>
@@ -437,15 +438,15 @@ class Runtime {
Agent* region_gpu() { return region_gpu_; }
const std::vector<const MemoryRegion*>& system_regions_fine() const {
const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_fine() const {
return system_regions_fine_;
}
const std::vector<const MemoryRegion*>& system_regions_coarse() const {
const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_coarse() const {
return system_regions_coarse_;
}
amd::hsa::loader::Loader* loader() { return loader_; }
amd::hsa::loader::Loader* loader() { return loader_.get(); }
amd::LoaderContext* loader_context() { return &loader_context_; }
@@ -719,10 +720,10 @@ class Runtime {
// Will be created before any user could call hsa_init but also could be
// destroyed before incorrectly written programs call hsa_shutdown.
static __forceinline KernelMutex& bootstrap_lock() {
static __forceinline std::mutex& bootstrap_lock() {
// This allocation is meant to last until the last thread has exited.
// It is intentionally not freed.
static KernelMutex* bootstrap_lock_ = new KernelMutex;
static std::mutex* bootstrap_lock_ = new std::mutex;
return *bootstrap_lock_;
}
Runtime();
@@ -780,7 +781,7 @@ class Runtime {
// Also ensures atomicity of pointer info queries by interlocking
// KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
// registered & mapped arrays.
KernelSharedMutex memory_lock_;
std::shared_mutex memory_lock_;
// Array containing driver interfaces for compatible agent kernel-mode
// drivers. Currently supports AIE agents.
@@ -811,16 +812,16 @@ class Runtime {
std::vector<uint32_t> gpu_ids_;
// List of all fine grain system memory regions in the platform.
std::vector<const MemoryRegion*> system_regions_fine_;
std::vector<std::shared_ptr<const MemoryRegion>> system_regions_fine_;
// List of all coarse grain system memory regions in the platform.
std::vector<const MemoryRegion*> system_regions_coarse_;
std::vector<std::shared_ptr<const MemoryRegion>> system_regions_coarse_;
// Matrix of IO link.
std::vector<LinkInfo> link_matrix_;
// Loader instance.
amd::hsa::loader::Loader* loader_;
std::unique_ptr<amd::hsa::loader::Loader> loader_;
// Loader context.
amd::LoaderContext loader_context_;
@@ -832,7 +833,7 @@ class Runtime {
std::map<const void*, AllocationRegion> allocation_map_;
// Pending prefetch containers.
KernelMutex prefetch_lock_;
std::mutex prefetch_lock_;
prefetch_map_t prefetch_map_;
// Allocator using ::system_region_
@@ -853,24 +854,29 @@ class Runtime {
// Number of Numa Nodes
size_t num_nodes_;
struct HsaEventDeleter {
void operator()(HsaEvent* event) { InterruptSignal::DestroyEvent(event); }
};
using unique_hsa_event_ptr = std::unique_ptr<HsaEvent, HsaEventDeleter>;
// @brief AMD HSA event to monitor for virtual memory access fault.
HsaEvent* vm_fault_event_;
unique_hsa_event_ptr vm_fault_event_;
// @brief HSA signal to contain the VM fault event.
Signal* vm_fault_signal_;
unique_signal_ptr vm_fault_signal_;
// @brief AMD HSA event to monitor for HW exceptions.
HsaEvent* hw_exception_event_;
unique_hsa_event_ptr hw_exception_event_;
// @brief HSA signal to contain the HW exception event.
Signal* hw_exception_signal_;
unique_signal_ptr hw_exception_signal_;
// Custom system event handlers.
std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
system_event_handlers_;
// System event handler lock
KernelMutex system_event_lock_;
std::mutex system_event_lock_;
// Internal queue creation notifier
AMD::callback_t<hsa_amd_runtime_queue_notifier> internal_queue_create_notifier_;
@@ -898,8 +904,8 @@ class Runtime {
// IPC DMA buf unix domain socket server dmabuf FD passing
int ipc_sock_server_fd_;
std::map<uint64_t, size_t> ipc_sock_server_conns_;
KernelMutex ipc_sock_server_lock_;
std::map<uint64_t, int> ipc_sock_server_conns_;
std::mutex ipc_sock_server_lock_;
private:
void CheckVirtualMemApiSupport();
@@ -50,6 +50,7 @@
#include <memory>
#include <vector>
#include <utility>
#include <mutex>
#include "hsakmt/hsakmt.h"
@@ -499,7 +500,7 @@ class Signal {
core::Agent* async_copy_agent_;
private:
static KernelMutex ipcLock_;
static std::mutex ipcLock_;
static std::map<decltype(hsa_signal_t::handle), Signal*> ipcMap_;
static Signal* lookupIpc(hsa_signal_t signal);
@@ -66,7 +66,6 @@ AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props)
}
AieAgent::~AieAgent() {
std::for_each(regions_.begin(), regions_.end(), DeleteObject());
regions_.clear();
}
@@ -75,8 +74,8 @@ hsa_status_t AieAgent::VisitRegion(bool include_peer,
void *data),
void *data) const {
AMD::callback_t<decltype(callback)> call(callback);
for (const auto r : regions_) {
hsa_region_t region_handle(core::MemoryRegion::Convert(r));
for (const auto& r : regions_) {
hsa_region_t region_handle(core::MemoryRegion::Convert(r.get()));
hsa_status_t err = call(region_handle, data);
if (err != HSA_STATUS_SUCCESS) {
return err;
@@ -321,24 +320,25 @@ void AieAgent::InitRegionList() {
/// explicit sync operations.
regions_.reserve(3);
regions_.push_back(
new MemoryRegion(false, true, false, false, true, this, sys_mem_props));
std::make_shared<MemoryRegion>(false, true, false, false, true, this, sys_mem_props));
regions_.push_back(
new MemoryRegion(false, false, false, false, true, this, dev_mem_props));
regions_.push_back(new MemoryRegion(false, false, false, false, true, this,
other_mem_props));
std::make_shared<MemoryRegion>(false, false, false, false, true, this, dev_mem_props));
regions_.push_back(
std::make_shared<MemoryRegion>(false, false, false, false, true, this, other_mem_props));
}
void AieAgent::InitAllocators() {
for (const auto *region : regions()) {
for (const auto& region : regions()) {
const MemoryRegion *amd_mem_region(
static_cast<const MemoryRegion *>(region));
static_cast<const MemoryRegion *>(region.get()));
if (amd_mem_region->kernarg()) {
const core::MemoryRegion* region_ptr = region.get();
system_allocator_ =
[region](size_t size, size_t align,
[region_ptr](size_t size, size_t align,
core::MemoryRegion::AllocateFlags alloc_flags) -> void * {
void *mem(nullptr);
return (core::Runtime::runtime_singleton_->AllocateMemory(
region, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
region_ptr, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
? mem
: nullptr;
};
@@ -165,8 +165,8 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
// Set group and private memory apertures in amd_queue_.
auto& regions = agent->regions();
for (auto region : regions) {
const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region);
for (const auto& region : regions) {
const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region.get());
uint64_t base = amdregion->GetBaseAddress();
if (amdregion->IsLDS()) {
@@ -217,7 +217,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
}
MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() {
ScopedAcquire<KernelMutex> _lock(&queue_lock());
std::lock_guard<std::mutex> _lock(queue_lock());
queue_count()--;
if (queue_count() == 0) {
core::InterruptSignal::DestroyEvent(queue_event());
@@ -232,7 +232,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
});
if (core::g_use_interrupt_wait) {
ScopedAcquire<KernelMutex> _lock(&queue_lock());
std::lock_guard<std::mutex> _lock(queue_lock());
queue_count()++;
if (queue_event() == nullptr) {
assert(queue_count() == 1 && "Inconsistency in queue event reference counting found.\n");
@@ -387,7 +387,7 @@ AqlQueue::~AqlQueue() {
FreeQueueMemory();
if (core::g_use_interrupt_wait) {
ScopedAcquire<KernelMutex> lock(&queue_lock());
std::lock_guard<std::mutex> lock(queue_lock());
queue_count()--;
if (queue_count() == 0) {
core::InterruptSignal::DestroyEvent(queue_event());
@@ -777,7 +777,7 @@ void AqlQueue::AsyncReclaimMainScratch() {
tool::notify_event_scratch_async_reclaim_start(public_handle(),
HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE);
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
std::lock_guard<std::mutex> lock(scratch_lock_);
// Unmap the queue. CP will check amd_queue_ fields on re-map
Suspend();
@@ -849,7 +849,7 @@ void AqlQueue::AsyncReclaimAltScratch() {
tool::notify_event_scratch_async_reclaim_start(public_handle(),
HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT);
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
std::lock_guard<std::mutex> lock(scratch_lock_);
// Unmap the queue. CP will check amd_queue_ fields on re-map
Suspend();
@@ -1014,7 +1014,7 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
const uint64_t device_size = size_per_thread * lanes_per_wave * device_slots;
const uint64_t dispatch_size = size_per_thread * lanes_per_wave * dispatch_slots;
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
std::lock_guard<std::mutex> lock(scratch_lock_);
// scratch.use_alt_limit will be 0 if alt scratch is not supported or disabled
if (dispatch_size < scratch.use_alt_limit && dispatch_slots < device_slots) {
@@ -1393,7 +1393,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask;
// Apply mask if non-default or not queue initialization.
ScopedAcquire<KernelMutex> lock(&mask_lock_);
std::lock_guard<std::mutex> lock(mask_lock_);
if ((!cu_mask_.empty()) || (num_cu_mask_count != 0) || (!global_mask.empty())) {
// Devices with WGPs must conform to even-indexed contiguous pairwise CU enablement.
@@ -1414,7 +1414,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
}
hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) {
ScopedAcquire<KernelMutex> lock(&mask_lock_);
std::lock_guard<std::mutex> lock(mask_lock_);
assert(!cu_mask_.empty() && "No current cu_mask!");
uint32_t user_dword_count = num_cu_mask_count / 32;
@@ -1440,7 +1440,7 @@ void AqlQueue::SetProfiling(bool enabled) {
void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope_t acquireFence,
hsa_fence_scope_t releaseFence, hsa_signal_t* in_signal) {
// pm4_ib_buf_ is a shared resource, so mutually exclude here.
ScopedAcquire<KernelMutex> lock(&pm4_ib_mutex_);
std::lock_guard<std::mutex> lock(pm4_ib_mutex_);
// Obtain reference to any container queue.
core::Queue* queue = core::Queue::Convert(public_handle());
@@ -293,7 +293,7 @@ static bool DepSignalCompleteHandler(hsa_signal_value_t signal_value, void *arg
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd_size,
uint64_t size) {
ScopedAcquire<KernelMutex> lock(&lock_);
std::unique_lock<std::mutex> lock(lock_);
// Alternate between completion signals
// Using two allows overlapping command writing and copies
@@ -310,7 +310,7 @@ hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd
// Mark signal as in use, guard against exception leaving the signal in an unusable state.
completionSignal->StoreRelaxed(2);
MAKE_SCOPE_GUARD([&]() { completionSignal->StoreRelaxed(0); });
lock.Release();
lock.unlock();
std::vector<core::Signal*> gang_signals(0);
@@ -64,7 +64,6 @@ CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props,
}
CpuAgent::~CpuAgent() {
std::for_each(regions_.begin(), regions_.end(), DeleteObject());
regions_.clear();
}
@@ -87,17 +86,17 @@ void CpuAgent::InitRegionList() {
if (system_prop != mem_props.end()) system_props = *system_prop;
// Fine-Grain Memory
regions_.push_back(new MemoryRegion(true, false, is_apu_node, false, true, this, system_props));
regions_.push_back(std::make_shared<MemoryRegion>(true, false, is_apu_node, false, true, this, system_props));
// Ext-Fine-Grain Memory
regions_.push_back(new MemoryRegion(false, false, is_apu_node, true, true, this, system_props));
regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, true, true, this, system_props));
// Kernargs
regions_.push_back(new MemoryRegion(true, true, is_apu_node, false, true, this, system_props));
regions_.push_back(std::make_shared<MemoryRegion>(true, true, is_apu_node, false, true, this, system_props));
if (!is_apu_node) {
// Coarse Grain
regions_.push_back(new MemoryRegion(false, false, is_apu_node, false, true, this, system_props));
regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, false, true, this, system_props));
}
}
}
@@ -150,12 +149,12 @@ hsa_status_t CpuAgent::VisitRegion(bool include_peer,
}
hsa_status_t CpuAgent::VisitRegion(
const std::vector<const core::MemoryRegion*>& regions,
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const {
for (const core::MemoryRegion* region : regions) {
for (const std::shared_ptr<const rocr::core::MemoryRegion>& region : regions) {
if (!region->user_visible()) continue;
hsa_region_t region_handle = core::MemoryRegion::Convert(region);
hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
hsa_status_t status = callback(region_handle, data);
if (status != HSA_STATUS_SUCCESS) {
return status;
@@ -112,7 +112,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
scratch_limit_async_threshold_(0),
scratch_cache_(
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
trap_handler_tma_region_(NULL),
trap_handler_tma_region_(nullptr, [this](void* ptr){
if (ptr && this->finegrain_allocator_) this->finegrain_deallocator()(ptr);
}),
rec_sdma_eng_override_(false),
pcs_hosttrap_data_(),
pcs_stochastic_data_(),
@@ -246,7 +248,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
GpuAgent::~GpuAgent() {
for (auto& blit : blits_) blit.reset();
std::for_each(regions_.begin(), regions_.end(), DeleteObject());
regions_.clear();
}
@@ -454,22 +455,20 @@ void GpuAgent::InitRegionList() {
memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
case HSA_HEAPTYPE_GPU_LDS:
case HSA_HEAPTYPE_GPU_SCRATCH: {
MemoryRegion* region =
new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]);
std::shared_ptr<MemoryRegion> region = std::make_shared<MemoryRegion>(false, false, false, false, true, this, mem_props[mem_idx]);
regions_.push_back(region);
if (region->IsLocalMemory()) {
// Extended Fine-Grain memory
if (!(isa_->GetMajorVersion() == 12 && isa_->GetMinorVersion() == 0))
regions_.push_back(
new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx]));
std::make_shared<MemoryRegion>(false, false, false, true, true, this, mem_props[mem_idx]));
// Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
bool user_visible = (properties_.HiveID != 0) ||
core::Runtime::runtime_singleton_->flag().fine_grain_pcie();
regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this,
regions_.push_back(std::make_shared<MemoryRegion>(true, false, false, false, user_visible, this,
mem_props[mem_idx]));
}
break;
@@ -561,7 +560,7 @@ void GpuAgent::ReserveScratch()
size_t available;
hsa_status_t err = driver().AvailableMemory(node_id(), &available);
assert(err == HSA_STATUS_SUCCESS && "AvailableMemory failed");
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
std::lock_guard<std::mutex> lock(scratch_lock_);
if (!scratch_cache_.reserved_bytes() && reserved_sz && available > 8 * reserved_sz) {
HSAuint64 alt_va;
void* reserved_base = scratch_pool_.alloc(reserved_sz);
@@ -676,20 +675,20 @@ hsa_status_t GpuAgent::VisitRegion(bool include_peer,
}
hsa_status_t GpuAgent::VisitRegion(
const std::vector<const core::MemoryRegion*>& regions,
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const {
AMD::callback_t<decltype(callback)> call(callback);
for (const core::MemoryRegion* region : regions) {
for (const auto& region : regions) {
if (!region->user_visible()) continue;
const AMD::MemoryRegion* amd_region =
reinterpret_cast<const AMD::MemoryRegion*>(region);
reinterpret_cast<const AMD::MemoryRegion*>(region.get());
// Only expose system, local, and LDS memory.
if (amd_region->IsSystem() || amd_region->IsLocalMemory() ||
amd_region->IsLDS()) {
hsa_region_t region_handle = core::MemoryRegion::Convert(region);
hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
hsa_status_t status = call(region_handle, data);
if (status != HSA_STATUS_SUCCESS) {
return status;
@@ -910,7 +909,7 @@ void GpuAgent::InitGWS() {
}
void GpuAgent::GWSRelease() {
ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
std::lock_guard<std::mutex> lock(gws_queue_.lock_);
gws_queue_.ref_ct_--;
if (gws_queue_.ref_ct_ != 0) return;
InitGWS();
@@ -968,22 +967,22 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) {
}
void GpuAgent::SetCopyRequestRefCount(bool set) {
ScopedAcquire<KernelMutex> lock(&blit_lock_);
std::unique_lock<std::mutex> lock(blit_lock_);
while (pending_copy_stat_check_ref_) {
blit_lock_.Release();
lock.unlock();
os::YieldThread();
blit_lock_.Acquire();
lock.lock();
}
if (!set && pending_copy_req_ref_) pending_copy_req_ref_--;
else pending_copy_req_ref_++;
}
void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
ScopedAcquire<KernelMutex> lock(&blit_lock_);
std::unique_lock<std::mutex> lock(blit_lock_);
while (pending_copy_req_ref_) {
blit_lock_.Release();
lock.unlock();
os::YieldThread();
blit_lock_.Acquire();
lock.lock();
}
if (!set && pending_copy_stat_check_ref_) pending_copy_stat_check_ref_--;
else pending_copy_stat_check_ref_++;
@@ -1059,7 +1058,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
std::min(gang_factor, properties_.NumSdmaXgmiEngines);
}
ScopedAcquire<KernelMutex> lock(&sdma_gang_lock_);
std::lock_guard<std::mutex> lock(sdma_gang_lock_);
// Manage internal gang signals
std::vector<core::Signal*> gang_signals;
if (gang_factor > 1) {
@@ -1642,7 +1641,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
if (status != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
for (auto r : regions()) availableBytes += ((AMD::MemoryRegion*)r)->GetCacheSize();
for (const auto& r : regions()) availableBytes += ((AMD::MemoryRegion*)(r.get()))->GetCacheSize();
availableBytes += scratch_cache_.free_bytes() - scratch_cache_.reserved_bytes();
@@ -1730,7 +1729,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
core::Queue** queue) {
// Handle GWS queues.
if (queue_type == HSA_QUEUE_TYPE_COOPERATIVE) {
ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
std::lock_guard<std::mutex> lock(gws_queue_.lock_);
auto ret = (*gws_queue_.queue_).get();
if (ret != nullptr) {
gws_queue_.ref_ct_++;
@@ -1876,7 +1875,7 @@ void GpuAgent::AcquireQueueMainScratch(ScratchInfo& scratch) {
*/
bool large;
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
std::lock_guard<std::mutex> lock(scratch_lock_);
const size_t small_limit = scratch_pool_.size() >> 3;
bool use_reclaim = true;
@@ -2035,7 +2034,7 @@ void GpuAgent::AcquireQueueAltScratch(ScratchInfo& scratch) {
uint64_t size_per_wave = AlignUp(scratch.alt_size_per_thread * properties_.WaveFrontSize, 1024);
if (size_per_wave > MAX_WAVE_SCRATCH) return;
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
std::lock_guard<std::mutex> lock(scratch_lock_);
// Ensure mapping will be in whole pages.
scratch.alt_size = AlignUp(scratch.alt_size, 4096);
@@ -2176,7 +2175,7 @@ uint64_t GpuAgent::TranslateTime(uint64_t tick) {
// Limit errors due to relative frequency drift to ~0.5us. Sync clocks at 16Hz.
const int64_t max_extrapolation = core::Runtime::runtime_singleton_->sys_clock_freq() >> 4;
ScopedAcquire<KernelMutex> lock(&t1_lock_);
std::lock_guard<std::mutex> lock(t1_lock_);
// Limit errors due to correlated pair certainty to ~0.5us.
// extrapolated time < (0.5us / half clock read certainty) * delay between clock measures
// clock read certainty is <4us.
@@ -2261,26 +2260,27 @@ hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttra
((uint64_t*)tma_region_host)[1] = (uint64_t)pcs_stochastic_buffers;
if (!trap_handler_tma_region_) {
trap_handler_tma_region_ = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0);
if (trap_handler_tma_region_ == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
void* mem = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0);
if (!mem) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
trap_handler_tma_region_.reset(mem);
// NearestCpuAgent owns the pool returned by system_allocator()
auto cpuAgent = GetNearestCpuAgent()->public_handle();
hsa_status_t ret =
AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_);
AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_.get());
assert(ret == HSA_STATUS_SUCCESS);
}
/* On non-large BAR systems, we may not be able to access device memory, so do a DmaCopy */
if (DmaCopy(trap_handler_tma_region_, tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
if (DmaCopy(trap_handler_tma_region_.get(), tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
tma_size = 2 * sizeof(uint64_t);
tma_addr = trap_handler_tma_region_;
tma_addr = trap_handler_tma_region_.get();
} else if (trap_handler_tma_region_) {
finegrain_deallocator()(trap_handler_tma_region_);
trap_handler_tma_region_ = NULL;
trap_handler_tma_region_.reset(nullptr);
}
// Bind the trap handler to this node.
@@ -2398,7 +2398,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen"));
ScopedAcquire<KernelMutex> lock(&xgmi_peer_list_lock_);
std::lock_guard<std::mutex> lock(xgmi_peer_list_lock_);
for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
uint64_t dst_handle = dst_agent.public_handle().handle;
@@ -2490,19 +2490,20 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
// Releases cached memory held by this GPU agent: runs the base-class
// Agent::Trim(), calls AsyncReclaimScratchQueues(), then trims the scratch
// cache while holding scratch_lock_.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// ScopedAcquire line appears to be the pre-change form and the
// std::lock_guard line directly below it is its replacement.
void GpuAgent::Trim() {
Agent::Trim();
AsyncReclaimScratchQueues();
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
std::lock_guard<std::mutex> lock(scratch_lock_);
scratch_cache_.trim(false);
}
void GpuAgent::InitAllocators() {
for (auto pool : GetNearestCpuAgent()->regions()) {
for (const auto& pool : GetNearestCpuAgent()->regions()) {
if (pool->kernarg()) {
system_allocator_ = [pool](size_t size, size_t alignment,
const core::MemoryRegion* pool_ptr = pool.get();
system_allocator_ = [pool_ptr](size_t size, size_t alignment,
MemoryRegion::AllocateFlags alloc_flags) -> void* {
assert(alignment <= 4096);
void* ptr = nullptr;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, &ptr))
core::Runtime::runtime_singleton_->AllocateMemory(pool_ptr, size, alloc_flags, &ptr))
? ptr
: nullptr;
};
@@ -2513,14 +2514,14 @@ void GpuAgent::InitAllocators() {
assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool.");
// Setup this GPU's fine-grain and coarse-grain allocators.
for (auto region : regions()) {
const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region);
for (const auto& region : regions()) {
const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region.get());
auto region_allocator = [region](size_t size,
auto region_allocator = [amd_region](size_t size,
MemoryRegion::AllocateFlags alloc_flags) -> void* {
void* ptr = nullptr;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr))
core::Runtime::runtime_singleton_->AllocateMemory(amd_region, size, alloc_flags, &ptr))
? ptr
: nullptr;
};
@@ -283,18 +283,18 @@ const core::MemoryRegion* RegionMemory::AgentLocal(hsa_agent_t agent, bool is_co
assert(amd_agent->device_type() == core::Agent::kAmdGpuDevice && "Invalid agent type.");
auto agent_local_region =
std::find_if(amd_agent->regions().begin(), amd_agent->regions().end(),
[&](const core::MemoryRegion* region) {
const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region;
[&](const std::shared_ptr<const core::MemoryRegion>& region) {
const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region.get();
return amd_region->IsLocalMemory() && (!amd_region->fine_grain());
});
return agent_local_region == amd_agent->regions().end() ? nullptr : *agent_local_region;
return agent_local_region == amd_agent->regions().end() ? nullptr : agent_local_region->get();
}
// Returns the first system memory region registered with the runtime
// singleton: the first coarse-grained region when is_code is true, otherwise
// the first fine-grained region.
// NOTE(review): this listing is a diff rendered without +/- markers; in each
// branch the first return appears to be the pre-change form and the second
// (with .get()) its replacement, which extracts the raw pointer from the
// element of the region list.
// NOTE(review): index [0] is used without an emptiness check — presumably at
// least one region of each kind is always registered; confirm.
const core::MemoryRegion* RegionMemory::System(bool is_code) {
if (is_code)
return core::Runtime::runtime_singleton_->system_regions_coarse()[0];
return core::Runtime::runtime_singleton_->system_regions_coarse()[0].get();
else
return core::Runtime::runtime_singleton_->system_regions_fine()[0];
return core::Runtime::runtime_singleton_->system_regions_fine()[0].get();
}
bool RegionMemory::Allocate(size_t size, size_t align, bool zero) {
@@ -48,6 +48,8 @@
#include "core/inc/amd_memory_region.h"
#include <algorithm>
#include <mutex>
#include <shared_mutex>
#include "core/inc/runtime.h"
#include "core/inc/amd_cpu_agent.h"
@@ -132,7 +134,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
MemoryRegion::~MemoryRegion() {}
// Locked entry point for allocation: takes the owning agent's
// agent_memory_lock_ and forwards all arguments to AllocateImpl, returning
// its status unchanged.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// ScopedAcquire line appears to be the pre-change form and the
// std::lock_guard line directly below it is its replacement.
hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
return AllocateImpl(size, alloc_flags, address, agent_node_id);
}
@@ -160,7 +162,7 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
}
// Locked entry point for freeing: takes the owning agent's
// agent_memory_lock_ and forwards to FreeImpl, returning its status unchanged.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// ScopedAcquire line appears to be the pre-change form and the
// std::lock_guard line directly below it is its replacement.
hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
return FreeImpl(address, size);
}
@@ -172,7 +174,7 @@ hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {
// TODO: Look into a better name and/or making this process transparent to exporting.
// Under the owning agent's agent_memory_lock_, asks the fragment allocator to
// discard the block at `address`.
// @return HSA_STATUS_ERROR_INVALID_ALLOCATION when discardBlock() reports
//         failure, HSA_STATUS_SUCCESS otherwise.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// ScopedAcquire line appears to be the pre-change form and the
// std::lock_guard line directly below it is its replacement.
hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const {
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
if (!fragment_allocator_.discardBlock(address)) return HSA_STATUS_ERROR_INVALID_ALLOCATION;
return HSA_STATUS_SUCCESS;
}
@@ -448,7 +450,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
std::vector<uint64_t> union_agents;
info.size = sizeof(info);
ScopedAcquire<KernelMutex> lock(&access_lock_);
std::lock_guard<std::mutex> lock(access_lock_);
if (core::Runtime::runtime_singleton_->PtrInfo(const_cast<void*>(ptr), &info, malloc,
&agent_count, &accessible,
@@ -512,8 +514,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
{ // Sequence with pointer info since queries to other fragments of the block may be adjusted by
// this call.
ScopedAcquire<KernelSharedMutex::Shared> lock(
core::Runtime::runtime_singleton_->memory_lock_.shared());
std::shared_lock<std::shared_mutex> lock(core::Runtime::runtime_singleton_->memory_lock_);
uint64_t alternate_va = 0;
if (owner()->driver().MakeMemoryResident(ptr, size, &alternate_va, &map_flag,
whitelist_nodes.size(),
@@ -1804,7 +1804,7 @@ hsa_status_t hsa_code_object_serialize(
IS_BAD_PTR(serialized_code_object);
IS_BAD_PTR(serialized_code_object_size);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
if (!code) {
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
@@ -1982,7 +1982,7 @@ hsa_status_t hsa_code_object_get_info(
IS_OPEN();
IS_BAD_PTR(value);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
if (!code) {
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
@@ -2039,7 +2039,7 @@ hsa_status_t hsa_code_object_get_symbol(
IS_BAD_PTR(symbol_name);
IS_BAD_PTR(symbol);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
if (!code) {
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
@@ -2059,7 +2059,7 @@ hsa_status_t hsa_code_object_get_symbol_from_name(
IS_BAD_PTR(symbol_name);
IS_BAD_PTR(symbol);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
if (!code) {
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
@@ -2097,7 +2097,7 @@ hsa_status_t hsa_code_object_iterate_symbols(
IS_OPEN();
IS_BAD_PTR(callback);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
if (!code) {
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
@@ -759,7 +759,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
}
const AMD::MemoryRegion* system_region = static_cast<const AMD::MemoryRegion*>(
core::Runtime::runtime_singleton_->system_regions_coarse()[0]);
core::Runtime::runtime_singleton_->system_regions_coarse()[0].get());
return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr);
CATCH;
@@ -799,7 +799,7 @@ hsa_status_t hsa_amd_memory_unlock(void* host_ptr) {
const AMD::MemoryRegion* system_region =
reinterpret_cast<const AMD::MemoryRegion*>(
core::Runtime::runtime_singleton_->system_regions_fine()[0]);
core::Runtime::runtime_singleton_->system_regions_fine()[0].get());
return system_region->Unlock(host_ptr);
CATCH;
@@ -340,7 +340,7 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
return;
}
ScopedAcquire<KernelMutex> lock(&lock_);
std::lock_guard<std::mutex> lock(lock_);
// Submit overflow packets.
if (!overflow_.empty()) {
@@ -48,7 +48,7 @@ namespace rocr {
namespace core {
HsaEvent* InterruptSignal::EventPool::alloc() {
ScopedAcquire<HybridMutex> lock(&lock_);
std::lock_guard<HybridMutex> lock(lock_);
if (events_.empty()) {
if (!allEventsAllocated) {
HsaEvent* evt = InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false);
@@ -64,7 +64,7 @@ HsaEvent* InterruptSignal::EventPool::alloc() {
// Returns an event to the pool: a null event is ignored; otherwise the event
// is wrapped in a unique_event_ptr and appended to events_ while lock_ is held.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// ScopedAcquire line appears to be the pre-change form and the
// std::lock_guard line directly below it is its replacement.
void InterruptSignal::EventPool::free(HsaEvent* evt) {
if (evt == nullptr) return;
ScopedAcquire<HybridMutex> lock(&lock_);
std::lock_guard<HybridMutex> lock(lock_);
events_.push_back(unique_event_ptr(evt));
}
@@ -50,7 +50,7 @@
namespace rocr {
namespace core {
// Out-of-class definition of the static IPCSignal::lock_ member.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// KernelMutex line appears to be the pre-change definition and the std::mutex
// line its replacement — only one of them exists in either version of the file.
KernelMutex IPCSignal::lock_;
std::mutex IPCSignal::lock_;
SharedMemory::SharedMemory(const hsa_amd_ipc_memory_t* handle, size_t len) {
hsa_status_t err = Runtime::runtime_singleton_->IPCAttach(handle, len, 0, NULL, &ptr_);
@@ -85,7 +85,7 @@ Signal* IPCSignal::Attach(const hsa_amd_ipc_signal_t* ipc_signal_handle) {
hsa_signal_t handle = SharedSignal::Convert(shared.signal());
ScopedAcquire<KernelMutex> lock(&lock_);
std::lock_guard<std::mutex> lock(lock_);
Signal* ret = core::Signal::DuplicateHandle(handle);
if (ret == nullptr) ret = new IPCSignal(std::move(shared));
return ret;
@@ -48,6 +48,7 @@
#include <string>
#include <vector>
#include <list>
#include <shared_mutex>
#if defined(__linux__)
#include <link.h>
#include <dlfcn.h>
@@ -119,7 +120,7 @@ bool g_use_mwaitx;
Runtime* Runtime::runtime_singleton_ = NULL;
hsa_status_t Runtime::Acquire() {
ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
std::lock_guard<std::mutex> boot(bootstrap_lock());
if (runtime_singleton_ == NULL) {
memset(log_flags, 0, sizeof(log_flags));
@@ -146,7 +147,7 @@ hsa_status_t Runtime::Acquire() {
}
hsa_status_t Runtime::Release() {
ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
std::lock_guard<std::mutex> boot(bootstrap_lock());
if (runtime_singleton_ == nullptr) return HSA_STATUS_ERROR_NOT_INITIALIZED;
@@ -192,7 +193,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
agents_by_gpuid_[0] = agent;
// Add cpu regions to the system region list.
for (const core::MemoryRegion* region : agent->regions()) {
for (auto region : agent->regions()) {
if (region->fine_grain()) {
system_regions_fine_.push_back(region);
} else {
@@ -216,7 +217,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
assert(alignment <= 4096);
void* ptr = NULL;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
core::Runtime::runtime_singleton_->AllocateMemory(pool.get(), size, alloc_flags,
&ptr, agent_node_id))
? ptr
: NULL;
@@ -336,7 +337,7 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
// Track the allocation result so that it could be freed properly.
if (status == HSA_STATUS_SUCCESS) {
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
allocation_map_[*address] = AllocationRegion(region, size, size_requested, alloc_flags);
}
@@ -354,7 +355,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
MemoryRegion::AllocateFlags alloc_flags = core::MemoryRegion::AllocateNoFlags;
{
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
std::map<const void*, AllocationRegion>::iterator it = allocation_map_.find(ptr);
@@ -458,7 +459,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback,
void* user_data) {
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
auto mem = allocation_map_.upper_bound(ptr);
if (mem != allocation_map_.begin()) {
mem--;
@@ -482,7 +483,7 @@ hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_ca
hsa_status_t Runtime::DeregisterReleaseNotifier(void* ptr,
hsa_amd_deallocation_callback_t callback) {
hsa_status_t ret = HSA_STATUS_ERROR_INVALID_ARGUMENT;
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
auto mem = allocation_map_.upper_bound(ptr);
if (mem != allocation_map_.begin()) {
mem--;
@@ -552,7 +553,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
// GPU-CPU
// Must ensure that system memory is visible to the GPU during the copy.
const AMD::MemoryRegion* system_region =
static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0]);
static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0].get());
void* gpuPtr = nullptr;
const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) {
@@ -698,7 +699,7 @@ hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
size_t alloc_size = 0;
{
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);
@@ -929,7 +930,7 @@ hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents,
*size = info.SizeInBytes;
*ptr = info.MemoryAddress;
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
allocation_map_[info.MemoryAddress] = AllocationRegion(
nullptr, info.SizeInBytes, info.SizeInBytes, core::MemoryRegion::AllocateNoFlags);
@@ -1055,7 +1056,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
{ // memory_lock protects access to the NMappedNodes array and fragment user data since these may
// change with calls to memory APIs.
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
if (VMemoryPtrInfo(ptr, &retInfo, alloc, num_agents_accessible, accessible) ==
HSA_STATUS_SUCCESS) {
@@ -1196,7 +1197,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
hsa_status_t Runtime::SetPtrInfoData(const void* ptr, void* userptr) {
{ // Use allocation map if possible to handle fragments.
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
const auto& it = allocation_map_.find(ptr);
if (it != allocation_map_.end()) {
it->second.user_ptr = userptr;
@@ -1307,7 +1308,7 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) {
size_t len = 0;
// Search for registered export pointer
ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
for (auto& conns : ipc_sock_server_conns_) {
if (conn_handle == conns.first) {
ptr = reinterpret_cast<void *>(conn_handle);
@@ -1372,7 +1373,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
if (useFrag) {
handle->handle[6] |= 0x80000000 | fragOffset;
// Prevent reallocation of fragment for better performance.
ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
std::shared_lock<std::shared_mutex> lock(memory_lock_);
err = allocation_map_[ptr].region->IPCFragmentExport(ptr);
assert(err == HSA_STATUS_SUCCESS && "Region inconsistent with address map.");
}
@@ -1439,7 +1440,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
close(dmabuf_fd);
ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
#if defined(__linux__)
if (!ipc_sock_server_conns_.size()) { // create new runtime socket server
struct sockaddr_un address;
@@ -1549,7 +1550,7 @@ int Runtime::IPCClientImport(uint32_t conn_handle, uint64_t dmabuf_fd_handle,
// Store the buffer object handle in allocation map for later use
if (err == HSAKMT_STATUS_SUCCESS) {
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
allocation_map_[*importAddress] =
AllocationRegion(nullptr, *importSize, *importSize, core::MemoryRegion::AllocateNoFlags);
allocation_map_[*importAddress].ldrm_bo = res.buf_handle;
@@ -1579,7 +1580,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
len = Min(len, importSize - fragOffset);
}
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
allocation_map_[importAddress] =
AllocationRegion(nullptr, len, len, core::MemoryRegion::AllocateNoFlags);
allocation_map_[importAddress].ldrm_bo = ldrm_bo;
@@ -1711,7 +1712,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
hsa_status_t Runtime::IPCDetach(void* ptr) {
bool ldrmImportCleaned = false;
{ // Handle imported fragments.
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::unique_lock<std::shared_mutex> lock(memory_lock_);
const auto& it = allocation_map_.find(ptr);
if (it != allocation_map_.end()) {
if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -1728,7 +1729,7 @@ hsa_status_t Runtime::IPCDetach(void* ptr) {
assert(!"Unimplemented!");
#endif
allocation_map_.erase(it);
lock.Release(); // Can't hold memory lock when using pointer info.
lock.unlock(); // Can't hold memory lock when using pointer info.
PtrInfoBlockData block = {};
hsa_amd_pointer_info_t info = {};
@@ -1954,7 +1955,7 @@ void Runtime::AsyncEventsPool::clear() {
}
Runtime::AsyncEventItem* Runtime::AsyncEventsPool::alloc() {
ScopedAcquire<HybridMutex> lock(&lock_);
std::lock_guard<HybridMutex> lock(lock_);
if (free_list_.empty()) {
AsyncEventItem* block = reinterpret_cast<AsyncEventItem*>(
allocate_()(block_size_ * sizeof(AsyncEventItem), __alignof(AsyncEventItem), core::MemoryRegion::AllocateNonPaged, 0));
@@ -1985,7 +1986,7 @@ void Runtime::AsyncEventsPool::free(AsyncEventItem* ptr) {
if (ptr == nullptr) return;
ptr->~AsyncEventItem();
ScopedAcquire<HybridMutex> lock(&lock_);
std::lock_guard<HybridMutex> lock(lock_);
ifdebug {
bool valid = false;
@@ -2059,33 +2060,33 @@ void Runtime::BindErrorHandlers() {
// Create memory event with manual reset to avoid racing condition
// with driver in case of multiple concurrent VM faults.
vm_fault_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true);
vm_fault_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true));
// Create an interrupt signal object to contain the memory event.
// This signal object will be registered with the async handler global
// thread.
vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_);
vm_fault_signal_.reset(new core::InterruptSignal(0, vm_fault_event_.get()));
if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) {
assert(false && "Failed on creating VM fault signal");
return;
}
SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0,
VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_));
SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_.get()));
// Create HW exception event which is for Non-RAS events
hw_exception_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true);
hw_exception_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true));
hw_exception_signal_ = new core::InterruptSignal(0, hw_exception_event_);
hw_exception_signal_.reset(new core::InterruptSignal(0, hw_exception_event_.get()));
if (!hw_exception_signal_->IsValid() || hw_exception_signal_->EopEvent() == NULL) {
assert(false && "Failed on creating HW Exception signal");
return;
}
SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_), HSA_SIGNAL_CONDITION_NE, 0,
HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_));
SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_.get()));
}
bool Runtime::HwExceptionHandler(hsa_signal_value_t val, void* arg) {
@@ -2262,7 +2263,8 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
}
void Runtime::PrintMemoryMapNear(void* ptr) {
runtime_singleton_->memory_lock_.Acquire();
std::unique_lock<std::shared_mutex> lock(runtime_singleton_->memory_lock_);
auto it = runtime_singleton_->allocation_map_.upper_bound(ptr);
for (int i = 0; i < 2; i++) {
if (it != runtime_singleton_->allocation_map_.begin()) it--;
@@ -2287,8 +2289,9 @@ void Runtime::PrintMemoryMapNear(void* ptr) {
it++;
}
fprintf(stderr, "\n");
it = start;
runtime_singleton_->memory_lock_.Release();
it = start;
lock.unlock();
hsa_amd_pointer_info_t info = {};
PtrInfoBlockData block = {};
uint32_t count = 0;
@@ -2408,7 +2411,7 @@ hsa_status_t Runtime::Load() {
BindErrorHandlers();
loader_ = amd::hsa::loader::Loader::Create(&loader_context_);
loader_.reset(amd::hsa::loader::Loader::Create(&loader_context_));
// Load extensions
LoadExtensions();
@@ -2449,8 +2452,8 @@ void Runtime::Unload() {
UnloadTools();
UnloadExtensions();
amd::hsa::loader::Loader::Destroy(loader_);
loader_ = nullptr;
amd::hsa::loader::Loader::Destroy(loader_.get());
loader_.reset();
for(auto nodeAgent: agents_by_node_) {
for (auto agent: nodeAgent.second)
@@ -2462,17 +2465,17 @@ void Runtime::Unload() {
if (vm_fault_signal_ != nullptr) {
vm_fault_signal_->DestroySignal();
vm_fault_signal_ = nullptr;
vm_fault_signal_.reset();
}
core::InterruptSignal::DestroyEvent(vm_fault_event_);
vm_fault_event_ = nullptr;
vm_fault_event_.reset();
if (hw_exception_signal_ != nullptr) {
hw_exception_signal_->DestroySignal();
hw_exception_signal_ = nullptr;
hw_exception_signal_.reset();
}
core::InterruptSignal::DestroyEvent(hw_exception_event_);
hw_exception_event_ = nullptr;
hw_exception_event_.reset();
SharedSignalPool.clear();
@@ -2890,7 +2893,7 @@ void Runtime::AsyncEvents::Clear() {
hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback,
void* data) {
ScopedAcquire<KernelMutex> lock(&system_event_lock_);
std::lock_guard<std::mutex> lock(system_event_lock_);
system_event_handlers_.push_back(
std::make_pair(AMD::callback_t<hsa_amd_system_event_callback_t>(callback), data));
return HSA_STATUS_SUCCESS;
@@ -2898,7 +2901,7 @@ hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_
// Returns a by-value copy of the registered system event handlers (callback,
// user-data pairs); the copy is made while system_event_lock_ is held.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// ScopedAcquire line appears to be the pre-change form and the
// std::lock_guard line directly below it is its replacement.
std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
Runtime::GetSystemEventHandlers() {
ScopedAcquire<KernelMutex> lock(&system_event_lock_);
std::lock_guard<std::mutex> lock(system_event_lock_);
return system_event_handlers_;
}
@@ -3269,7 +3272,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
}
{
ScopedAcquire<KernelMutex> lock(&prefetch_lock_);
std::lock_guard<std::mutex> lock(prefetch_lock_);
// Remove all fully overlapped and trim partially overlapped ranges.
// Get iteration bounds
auto start = prefetch_map_.upper_bound(base);
@@ -3332,7 +3335,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
// Remove the prefetch's ranges from the map.
static auto removePrefetchRanges = [](PrefetchOp* op) {
ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
auto it = op->prefetch_map_entry;
while (it != Runtime::runtime_singleton_->prefetch_map_.end()) {
auto next = it->second.next;
@@ -3389,7 +3392,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
std::vector<std::pair<uintptr_t, uintptr_t>> holes;
ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
auto start = prefetch_map_.upper_bound(base);
if (start != prefetch_map_.begin()) start--;
auto stop = prefetch_map_.lower_bound(end);
@@ -3441,7 +3444,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, uint64_t* offset,
uint64_t flags) {
#ifdef __linux__
ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
std::shared_lock<std::shared_mutex> lock(memory_lock_);
// Lookup containing allocation.
auto mem = allocation_map_.upper_bound(ptr);
if (mem != allocation_map_.begin()) {
@@ -3507,7 +3510,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add
if (!alignment) alignment = rocr::os::PageSize();
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
if (flags & HSA_AMD_VMEM_ADDRESS_NO_REGISTER) {
size_t requested = size + alignment - rocr::os::PageSize();
@@ -3548,7 +3551,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add
}
hsa_status_t Runtime::VMemoryAddressFree(void* va, size_t size) {
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
std::map<const void*, AddressHandle>::iterator it = reserved_address_map_.find(va);
if (it == reserved_address_map_.end()) {
@@ -3580,7 +3583,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
if (!IsMultipleOf(size, memRegion->GetPageSize()))
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
ThunkHandle user_mode_driver_handle;
hsa_status_t status =
region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0);
@@ -3597,7 +3600,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
}
hsa_status_t Runtime::VMemoryHandleRelease(hsa_amd_vmem_alloc_handle_t memoryOnlyHandle) {
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
auto memoryHandleIt = memory_handle_map_.find(MemoryHandle::Convert(memoryOnlyHandle));
if (memoryHandleIt == memory_handle_map_.end()) {
@@ -3628,7 +3631,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
uint64_t offset = 0, ret;
uint64_t drm_cpu_addr = 0;
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
auto addressHandle = VMemoryFindReservedAddressHandle(va);
if (addressHandle == nullptr ||
reinterpret_cast<uint8_t*>(va) + size >
@@ -3703,7 +3706,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
}
hsa_status_t Runtime::VMemoryHandleUnmap(void* va, size_t size) {
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
std::list<std::pair<void*, MappedHandle*>> mappedHandles;
// va + size may consist of multiple MappedHandle's.
@@ -3921,7 +3924,7 @@ hsa_status_t Runtime::VMemorySetAccess(void* va, size_t size,
if (targetAgent == NULL || !targetAgent->IsValid()) return HSA_STATUS_ERROR_INVALID_AGENT;
}
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
auto addressHandle = VMemoryFindReservedAddressHandle(va);
if (addressHandle == nullptr ||
@@ -4014,7 +4017,7 @@ hsa_status_t Runtime::VMemoryGetAccess(const void* va, hsa_access_permission_t*
*perms = HSA_ACCESS_PERMISSION_NONE;
bool mappedHandleFound = false;
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
std::lock_guard<std::shared_mutex> lock(memory_lock_);
auto mappedHandleIt = mapped_handle_map_.upper_bound(va);
if (mappedHandleIt != mapped_handle_map_.begin()) {
@@ -4076,8 +4079,8 @@ hsa_status_t Runtime::VMemoryImportShareableHandle(int dmabuf_fd,
return;
}
for (const core::MemoryRegion* region : agent->regions()) {
const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region);
for (const auto& region : agent->regions()) {
const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region.get());
// TODO: Verify that this works on a system with FINE_GRAINED memory.
// Systems with FINE_GRAINED will have both COARSE and FINE grain... need to get the
@@ -58,7 +58,7 @@
namespace rocr {
namespace core {
// Out-of-class definitions of Signal's static IPC bookkeeping members:
// ipcLock_ is the mutex taken by the IPC register/lookup functions around
// their accesses to ipcMap_, which maps a signal handle value to its Signal
// object.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// KernelMutex line appears to be the pre-change definition of ipcLock_ and the
// std::mutex line its replacement.
KernelMutex Signal::ipcLock_;
std::mutex Signal::ipcLock_;
std::map<decltype(hsa_signal_t::handle), Signal*> Signal::ipcMap_;
void SharedSignalPool_t::clear() {
@@ -76,7 +76,7 @@ void SharedSignalPool_t::clear() {
}
SharedSignal* SharedSignalPool_t::alloc() {
ScopedAcquire<HybridMutex> lock(&lock_);
std::lock_guard<HybridMutex> lock(lock_);
if (free_list_.empty()) {
SharedSignal* block = reinterpret_cast<SharedSignal*>(
allocate_()(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), core::MemoryRegion::AllocateNonPaged, 0));
@@ -109,7 +109,7 @@ void SharedSignalPool_t::free(SharedSignal* ptr) {
if (ptr == nullptr) return;
ptr->~SharedSignal();
ScopedAcquire<HybridMutex> lock(&lock_);
std::lock_guard<HybridMutex> lock(lock_);
ifdebug {
bool valid = false;
@@ -134,7 +134,7 @@ LocalSignal::LocalSignal(hsa_signal_value_t initial_value, bool exportable)
}
void Signal::registerIpc() {
ScopedAcquire<KernelMutex> lock(&ipcLock_);
std::lock_guard<std::mutex> lock(ipcLock_);
auto handle = Convert(this);
assert(ipcMap_.find(handle.handle) == ipcMap_.end() &&
"Can't register the same IPC signal twice.");
@@ -142,7 +142,7 @@ void Signal::registerIpc() {
}
bool Signal::deregisterIpc() {
ScopedAcquire<KernelMutex> lock(&ipcLock_);
std::lock_guard<std::mutex> lock(ipcLock_);
if (refcount_ != 0) return false;
auto handle = Convert(this);
const auto& it = ipcMap_.find(handle.handle);
@@ -152,14 +152,14 @@ bool Signal::deregisterIpc() {
}
// Looks up the Signal registered in ipcMap_ under signal.handle while holding
// ipcLock_.
// @return the mapped Signal*, or nullptr when the handle is not in the map.
// This function does not touch refcount_.
// NOTE(review): this listing is a diff rendered without +/- markers; the
// ScopedAcquire line appears to be the pre-change form and the
// std::lock_guard line directly below it is its replacement.
Signal* Signal::lookupIpc(hsa_signal_t signal) {
ScopedAcquire<KernelMutex> lock(&ipcLock_);
std::lock_guard<std::mutex> lock(ipcLock_);
const auto& it = ipcMap_.find(signal.handle);
if (it == ipcMap_.end()) return nullptr;
return it->second;
}
Signal* Signal::duplicateIpc(hsa_signal_t signal) {
ScopedAcquire<KernelMutex> lock(&ipcLock_);
std::lock_guard<std::mutex> lock(ipcLock_);
const auto& it = ipcMap_.find(signal.handle);
if (it == ipcMap_.end()) return nullptr;
it->second->refcount_++;
@@ -125,16 +125,16 @@ template <typename T> class lazy_ptr {
private:
mutable std::unique_ptr<T> obj;
mutable std::function<T*(void)> func;
mutable KernelMutex lock;
mutable std::mutex lock;
// Separated from make to improve inlining.
void make_body(bool block) const {
if (block) {
lock.Acquire();
} else if (!lock.Try()) {
lock.lock();
} else if (!lock.try_lock()) {
return;
}
MAKE_SCOPE_GUARD([&]() { lock.Release(); });
MAKE_SCOPE_GUARD([&]() { lock.unlock(); });
if (func == nullptr) return;
T* ptr = func();
obj.reset(ptr);
@@ -90,6 +90,11 @@ class HybridMutex {
os::PostSemaphore(sem_);
}
// To add compatibility with std::lock_guard
void lock() { Acquire(); }
void unlock() { Release(); }
bool try_lock() { return Try(); }
private:
std::atomic<int> lock_;
os::Semaphore sem_;
@@ -100,27 +105,6 @@ class HybridMutex {
DISALLOW_COPY_AND_ASSIGN(HybridMutex);
};
/// @brief: a class that represents a kernel mutex.
/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
/// until the lock is released (Best for long waits, though anything using
/// a kernel object is a long wait).
/// Thin wrapper over the os:: mutex functions: the constructor creates the
/// underlying os::Mutex and the destructor destroys it.
/// NOTE(review): per the hunk header (27 lines shrinking to 6) and the commit
/// description, this class appears to be deleted by the change and replaced
/// with std::mutex at its use sites.
class KernelMutex {
public:
KernelMutex() { lock_ = os::CreateMutex(); }
~KernelMutex() { os::DestroyMutex(lock_); }
// Forwards to os::TryAcquireMutex and returns its result.
bool Try() { return os::TryAcquireMutex(lock_); }
// Forwards to os::AcquireMutex and returns its result.
bool Acquire() { return os::AcquireMutex(lock_); }
// Forwards to os::ReleaseMutex.
void Release() { os::ReleaseMutex(lock_); }
private:
// Handle created by os::CreateMutex() and owned by this object.
os::Mutex lock_;
/// @brief: Disable copiable and assignable ability.
DISALLOW_COPY_AND_ASSIGN(KernelMutex);
};
/// @brief: represents a spin lock.
/// For very short hold durations on the order of the thread scheduling
/// quanta or less.
@@ -143,6 +127,11 @@ class SpinMutex {
}
// Writes 0 into the atomic lock word (same sequentially consistent store that
// plain assignment to a std::atomic performs).
void Release() { lock_.store(0); }
// Standard-library style spellings of the mutex operations so that this type
// can be used with std::lock_guard, which calls lock() when it is constructed
// and unlock() when it is destroyed.
// Forwards to Acquire(); anything Acquire() returns is discarded.
void lock() { Acquire(); }
// Forwards to Release().
void unlock() { Release(); }
// Forwards to Try() and hands its result back to the caller.
bool try_lock() { return Try(); }
private:
std::atomic<int> lock_;
@@ -167,124 +156,6 @@ class KernelEvent {
DISALLOW_COPY_AND_ASSIGN(KernelEvent);
};
/// @brief: represents a yielding shared mutex.
/// aka read/write mutex
class KernelSharedMutex {
public:
/// @brief: Interfaces ScopedAcquire to shared operations.
class Shared {
public:
explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
bool Try() { return lock_->TryShared(); }
bool Acquire() { return lock_->AcquireShared(); }
void Release() { lock_->ReleaseShared(); }
private:
KernelSharedMutex* lock_;
};
KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
// Exclusive mode operations
bool Try() { return os::TryAcquireSharedMutex(lock_); }
bool Acquire() { return os::AcquireSharedMutex(lock_); }
void Release() { os::ReleaseSharedMutex(lock_); }
// Shared mode operations
bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
// Return shared operations interface
Shared shared() { return Shared(this); }
private:
os::SharedMutex lock_;
/// @brief: Disable copiable and assignable ability.
DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
};
/// @brief: Type trait to identify mutex types.
/// The primary template reports value == false for every type. Each explicit
/// specialization below opts one lock class in by reporting value == true.
/// ScopedAcquire reads this trait in its static_asserts and uses it to pick
/// which internal container specialization holds the lock.
template <class T> class isMutex {
public:
enum { value = false };
};
/// @brief: HybridMutex is a mutex type.
template <> class isMutex<HybridMutex> {
public:
enum { value = true };
};
/// @brief: KernelMutex is a mutex type.
template <> class isMutex<KernelMutex> {
public:
enum { value = true };
};
/// @brief: SpinMutex is a mutex type.
template <> class isMutex<SpinMutex> {
public:
enum { value = true };
};
/// @brief: KernelSharedMutex is a mutex type.
template <> class isMutex<KernelSharedMutex> {
public:
enum { value = true };
};
/// @brief: A class that behaves as a lock held for the duration of a scope.
/// To enter a critical section, create an object of this class. When control
/// leaves the scope, the lock is released automatically (unless Release() was
/// already called explicitly).
/// LockType is either a type registered with isMutex (passed by pointer) or
/// another object providing Acquire()/Release() (passed by value).
template <class LockType> class ScopedAcquire {
public:
/// @brief: Acquires the lock on construction.
/// @param: lock(Input), pointer to an existing lock. Only valid when
/// isMutex<LockType>::value is true; enforced by the static_assert.
explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
lock_.Acquire();
}
/// @brief: Acquires the lock on construction.
/// @param: lock(Input), a lock-like object taken by value. Only valid when
/// isMutex<LockType>::value is false; enforced by the static_assert.
explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
lock_.Acquire();
}
/// @brief: Releases the lock on destruction, unless Release() already did.
~ScopedAcquire() {
if (doRelease) lock_.Release();
}
/// @brief: Release the lock early. Avoid using when possible.
/// Clears doRelease so the destructor does not release a second time.
void Release() {
lock_.Release();
doRelease = false;
}
private:
/// @brief: Holder used when the second template argument is true: stores a
/// pointer to the lock and forwards Acquire/Release through it.
template <class T, bool B> class container {
public:
container(T* lock) : lock_(lock) {}
__forceinline bool Acquire() { return lock_->Acquire(); }
__forceinline void Release() { return lock_->Release(); }
private:
T* lock_;
};
/// @brief: Specialization used when the second template argument is false:
/// stores the lock-like object by value and forwards Acquire/Release to it.
template <class T> class container<T, false> {
public:
container(T lock) : lock_(lock) {}
__forceinline bool Acquire() { return lock_.Acquire(); }
__forceinline void Release() { return lock_.Release(); }
private:
T lock_;
};
// Holder selected by isMutex<LockType>::value (pointer form for mutex types,
// by-value form otherwise).
container<LockType, isMutex<LockType>::value> lock_;
// True while the destructor is still responsible for releasing the lock.
bool doRelease;
/// @brief: Disables copying and assignment.
DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
};
} // namespace rocr
#endif // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_
@@ -286,11 +286,6 @@ namespace code {
}
}
// Destructor: deletes every Symbol held in the symbols container.
AmdHsaCode::~AmdHsaCode()
{
  for (auto it = symbols.begin(); it != symbols.end(); ++it) {
    Symbol* owned = *it;
    delete owned;
  }
}
bool AmdHsaCode::PullElf()
{
uint32_t majorVersion, minorVersion;
@@ -330,7 +325,7 @@ namespace code {
}
for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) {
amd::elf::Symbol* elfsym = img->symtab()->symbol(i);
Symbol* sym = 0;
std::shared_ptr<Symbol> sym;
switch (elfsym->type()) {
case STT_AMDGPU_HSA_KERNEL: {
amd::elf::Section* sec = elfsym->section();
@@ -347,12 +342,12 @@ namespace code {
out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
return false;
}
sym = new KernelSymbol(elfsym, &akc);
sym = std::make_shared<KernelSymbol>(elfsym, &akc);
break;
}
case STT_OBJECT:
case STT_COMMON:
sym = new VariableSymbol(elfsym);
sym = std::make_shared<VariableSymbol>(elfsym);
break;
default:
break; // Skip unknown symbols.
@@ -924,9 +919,9 @@ namespace code {
std::string(module_name ? module_name : ""),
std::string(symbol_name)
);
for (Symbol* sym : symbols) {
for (const auto& sym : symbols) {
if (sym->Name() == mname) {
*s = Symbol::ToHandle(sym);
*s = Symbol::ToHandle(sym.get());
return HSA_STATUS_SUCCESS;
}
}
@@ -940,8 +935,8 @@ namespace code {
void* data),
void* data)
{
for (Symbol* sym : symbols) {
hsa_code_symbol_t s = Symbol::ToHandle(sym);
for (const auto& sym : symbols) {
hsa_code_symbol_t s = Symbol::ToHandle(sym.get());
hsa_status_t status = callback(code_object, s, data);
if (status != HSA_STATUS_SUCCESS) { return status; }
}
@@ -1144,8 +1139,8 @@ namespace code {
{
if (nullptr == img) { return nullptr; }
if (!section) { section = HsaText(); }
symbols.push_back(new KernelSymbol(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
return symbols.back();
symbols.push_back(std::make_shared<KernelSymbol>(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
return symbols.back().get();
}
Symbol* AmdHsaCode::AddVariableSymbol(const std::string &name,
@@ -1157,8 +1152,8 @@ namespace code {
uint64_t size)
{
if (nullptr == img) { return nullptr; }
symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
return symbols.back();
symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
return symbols.back().get();
}
void AmdHsaCode::AddSectionSymbols()
@@ -1166,16 +1161,16 @@ namespace code {
if (nullptr == img) { return; }
for (size_t i = 0; i < dataSections.size(); ++i) {
if (dataSections[i] && dataSections[i]->flags() & SHF_ALLOC) {
symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
}
}
}
Symbol* AmdHsaCode::GetSymbolByElfIndex(size_t index)
{
for (auto &s : symbols) {
for (const auto &s : symbols) {
if (s && index == s->Index()) {
return s;
return s.get();
}
}
return nullptr;
@@ -1185,7 +1180,7 @@ namespace code {
{
for (auto &s : symbols) {
if (s && n == s->Name()) {
return s;
return s.get();
}
}
return nullptr;
@@ -1747,14 +1742,13 @@ namespace code {
return false;
}
AmdHsaCode* AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
const std::shared_ptr<AmdHsaCode>& AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
{
CodeMap::iterator i = codeMap.find(c.handle);
if (i == codeMap.end()) {
AmdHsaCode* code = new AmdHsaCode();
std::shared_ptr<AmdHsaCode> code = std::make_shared<AmdHsaCode>();
const void* buffer = reinterpret_cast<const void*>(c.handle);
if (!code->InitAsBuffer(buffer, 0)) {
delete code;
return 0;
}
codeMap[c.handle] = code;
@@ -1770,7 +1764,7 @@ namespace code {
// Currently, we do not always create map entry for every code object buffer.
return true;
}
delete i->second;
i->second.reset();
codeMap.erase(i);
return true;
}
@@ -1798,7 +1792,7 @@ namespace code {
}
for (size_t i = 0; i < img->getSymbolTable()->symbolCount(); ++i) {
amd::elf::Symbol* elfsym = img->getSymbolTable()->symbol(i);
Symbol* sym = 0;
std::shared_ptr<Symbol> sym;
switch (elfsym->type()) {
case STT_AMDGPU_HSA_KERNEL: {
amd::elf::Section* sec = elfsym->section();
@@ -1815,12 +1809,12 @@ namespace code {
out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
return false;
}
sym = new KernelSymbolV2(elfsym, &akc);
sym = std::make_shared<KernelSymbolV2>(elfsym, &akc);
break;
}
case STT_OBJECT:
case STT_COMMON:
sym = new VariableSymbolV2(elfsym);
sym = std::make_shared<VariableSymbolV2>(elfsym);
break;
default:
break; // Skip unknown symbols.
@@ -186,7 +186,6 @@ void Loader::Destroy(Loader *loader)
_amdgpu_r_debug.r_map = nullptr;
_amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT;
r_debug_tail() = nullptr;
delete loader;
}
Executable* AmdHsaCodeLoader::CreateExecutable(
@@ -194,8 +193,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
{
WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
executables.push_back(new ExecutableImpl(profile, context, executables.size(), default_float_rounding_mode));
return executables.back();
executables.push_back(std::make_shared<ExecutableImpl>(profile, context, executables.size(), default_float_rounding_mode));
return executables.back().get();
}
Executable* AmdHsaCodeLoader::CreateExecutable(
@@ -206,8 +205,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
{
WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
executables.push_back(new ExecutableImpl(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
return executables.back();
executables.push_back(std::make_shared<ExecutableImpl>(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
return executables.back().get();
}
static void AddCodeObjectInfoIntoDebugMap(link_map* map) {
@@ -254,7 +253,7 @@ hsa_status_t AmdHsaCodeLoader::FreezeExecutable(Executable *executable, const ch
atomic::Fence(std::memory_order_acq_rel);
_loader_debug_state();
atomic::Fence(std::memory_order_acq_rel);
for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
AddCodeObjectInfoIntoDebugMap(&(lco->r_debug_info));
}
atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
@@ -270,14 +269,13 @@ void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) {
atomic::Fence(std::memory_order_acq_rel);
_loader_debug_state();
atomic::Fence(std::memory_order_acq_rel);
for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
RemoveCodeObjectInfoFromDebugMap(&(lco->r_debug_info));
}
atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
_loader_debug_state();
executables[((ExecutableImpl*)executable)->id()] = nullptr;
delete executable;
executables[static_cast<ExecutableImpl*>(executable)->id()].reset();
}
hsa_status_t AmdHsaCodeLoader::IterateExecutables(
@@ -289,9 +287,9 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables(
WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
assert(callback);
for (auto &exec : executables) {
for (const auto &exec : executables) {
if(exec != nullptr){
hsa_status_t status = callback(Executable::Handle(exec), data);
hsa_status_t status = callback(Executable::Handle(exec.get()), data);
if (status != HSA_STATUS_SUCCESS) {
return status;
}
@@ -318,7 +316,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
this->EnableReadOnlyMode();
size_t actual_num_segment_descriptors = 0;
for (auto &executable : executables) {
for (const auto &executable : executables) {
if (executable) {
actual_num_segment_descriptors += executable->GetNumSegmentDescriptors();
}
@@ -335,7 +333,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
}
size_t i = 0;
for (auto &executable : executables) {
for (const auto &executable : executables) {
if (executable) {
i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i);
}
@@ -352,7 +350,7 @@ uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address)
return 0;
}
for (auto &exec : executables) {
for (const auto &exec : executables) {
if (exec != nullptr) {
uint64_t host_address = exec->FindHostAddress(device_address);
if (host_address != 0) {
@@ -371,9 +369,9 @@ void AmdHsaCodeLoader::PrintHelp(std::ostream& out)
void AmdHsaCodeLoader::EnableReadOnlyMode()
{
rw_lock_.ReaderLock();
for (auto &executable : executables) {
for (const auto &executable : executables) {
if (executable) {
((ExecutableImpl*)executable)->EnableReadOnlyMode();
((ExecutableImpl*)executable.get())->EnableReadOnlyMode();
}
}
}
@@ -381,9 +379,9 @@ void AmdHsaCodeLoader::EnableReadOnlyMode()
void AmdHsaCodeLoader::DisableReadOnlyMode()
{
rw_lock_.ReaderUnlock();
for (auto &executable : executables) {
for (const auto &executable : executables) {
if (executable) {
((ExecutableImpl*)executable)->DisableReadOnlyMode();
((ExecutableImpl*)executable.get())->DisableReadOnlyMode();
}
}
}
@@ -781,18 +779,10 @@ ExecutableImpl::ExecutableImpl(
}
ExecutableImpl::~ExecutableImpl() {
for (ExecutableObject* o : objects) {
for (const auto& o : objects) {
o->Destroy();
delete o;
}
objects.clear();
for (auto &symbol_entry : program_symbols_) {
delete symbol_entry.second;
}
for (auto &symbol_entry : agent_symbols_) {
delete symbol_entry.second;
}
}
hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
@@ -812,7 +802,7 @@ hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
program_symbols_.insert(
std::make_pair(std::string(name),
new VariableSymbol(true,
std::make_shared<VariableSymbol>(true,
"", // Only program linkage symbols can be
// defined.
std::string(name),
@@ -848,7 +838,7 @@ hsa_status_t ExecutableImpl::DefineAgentExternalVariable(
auto insert_status = agent_symbols_.insert(
std::make_pair(std::make_pair(std::string(name), agent),
new VariableSymbol(true,
std::make_shared<VariableSymbol>(true,
"", // Only program linkage symbols can be
// defined.
std::string(name),
@@ -896,14 +886,14 @@ Symbol* ExecutableImpl::GetSymbolInternal(
if (!agent) {
auto program_symbol = program_symbols_.find(mangled_name);
if (program_symbol != program_symbols_.end()) {
return program_symbol->second;
return program_symbol->second.get();
}
return nullptr;
}
auto agent_symbol = agent_symbols_.find(std::make_pair(mangled_name, *agent));
if (agent_symbol != agent_symbols_.end()) {
return agent_symbol->second;
return agent_symbol->second.get();
}
return nullptr;
}
@@ -916,14 +906,14 @@ hsa_status_t ExecutableImpl::IterateSymbols(
for (auto &symbol_entry : program_symbols_) {
hsa_status_t hsc =
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
if (HSA_STATUS_SUCCESS != hsc) {
return hsc;
}
}
for (auto &symbol_entry : agent_symbols_) {
hsa_status_t hsc =
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
if (HSA_STATUS_SUCCESS != hsc) {
return hsc;
}
@@ -948,7 +938,7 @@ hsa_status_t ExecutableImpl::IterateAgentSymbols(
}
hsa_status_t status = callback(
Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second),
Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second.get()),
data);
if (status != HSA_STATUS_SUCCESS) {
return status;
@@ -968,7 +958,7 @@ hsa_status_t ExecutableImpl::IterateProgramSymbols(
for (auto &symbol_entry : program_symbols_) {
hsa_status_t status = callback(
Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
if (status != HSA_STATUS_SUCCESS) {
return status;
}
@@ -987,10 +977,10 @@ hsa_status_t ExecutableImpl::IterateLoadedCodeObjects(
ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
assert(callback);
for (auto &loaded_code_object : loaded_code_objects) {
for (const auto& loaded_code_object : loaded_code_objects) {
hsa_status_t status = callback(
Executable::Handle(this),
LoadedCodeObject::Handle(loaded_code_object),
LoadedCodeObject::Handle(loaded_code_object.get()),
data);
if (status != HSA_STATUS_SUCCESS) {
return status;
@@ -1004,7 +994,7 @@ size_t ExecutableImpl::GetNumSegmentDescriptors()
{
// assuming we are in readonly mode.
size_t actual_num_segment_descriptors = 0;
for (auto &obj : loaded_code_objects) {
for (const auto &obj : loaded_code_objects) {
actual_num_segment_descriptors += obj->LoadedSegments().size();
}
return actual_num_segment_descriptors;
@@ -1020,7 +1010,7 @@ size_t ExecutableImpl::QuerySegmentDescriptors(
assert(first_empty_segment_descriptor < total_num_segment_descriptors);
size_t i = first_empty_segment_descriptor;
for (auto &obj : loaded_code_objects) {
for (const auto &obj : loaded_code_objects) {
assert(i < total_num_segment_descriptors);
for (auto &seg : obj->LoadedSegments()) {
segment_descriptors[i].agent = seg->Agent();
@@ -1084,11 +1074,11 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
return execHandle;
}
for (auto &exec : executables) {
for (const auto &exec : executables) {
if (exec != nullptr) {
uint64_t host_address = exec->FindHostAddress(device_address);
if (host_address != 0) {
return Executable::Handle(exec);
return Executable::Handle(exec.get());
}
}
}
@@ -1098,7 +1088,7 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address)
{
ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
for (auto &obj : loaded_code_objects) {
for (const auto &obj : loaded_code_objects) {
assert(obj);
for (auto &seg : obj->LoadedSegments()) {
assert(seg);
@@ -1224,7 +1214,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
uint32_t codeNum = NextCodeObjectNum();
code.reset(new code::AmdHsaCode());
code = std::make_unique<code::AmdHsaCode>();
std::string substituteFileName;
for (const Substitute& ss : substitutes) {
@@ -1306,8 +1296,8 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
hsa_status_t status;
objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize()));
loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back());
objects.push_back(std::make_shared<LoadedCodeObjectImpl>(this, agent, code->ElfData(), code->ElfSize()));
loaded_code_objects.push_back(std::static_pointer_cast<LoadedCodeObjectImpl>(objects.back()));
status = LoadSegments(agent, code.get(), majorVersion);
if (status != HSA_STATUS_SUCCESS) return status;
@@ -1338,7 +1328,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
loaded_code_objects.back()->r_debug_info.l_prev = nullptr;
loaded_code_objects.back()->r_debug_info.l_next = nullptr;
if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back()); }
if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back().get()); }
return HSA_STATUS_SUCCESS;
}
@@ -1376,18 +1366,18 @@ hsa_status_t ExecutableImpl::LoadSegmentsV2(hsa_agent_t agent,
AMD_ISA_ALIGN_BYTES, true);
if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
Segment *load_segment = new Segment(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
std::shared_ptr<Segment> load_segment = std::make_shared<Segment>(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
ptr, size, vaddr, c->DataSegment(0)->offset());
if (!load_segment) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
hsa_status_t status = HSA_STATUS_SUCCESS;
for (size_t i = 0; i < c->DataSegmentCount(); ++i) {
status = LoadSegmentV2(c->DataSegment(i), load_segment);
status = LoadSegmentV2(c->DataSegment(i), load_segment.get());
if (status != HSA_STATUS_SUCCESS) return status;
}
objects.push_back(load_segment);
loaded_code_objects.back()->LoadedSegments().push_back(load_segment);
loaded_code_objects.back()->LoadedSegments().push_back(load_segment.get());
return HSA_STATUS_SUCCESS;
}
@@ -1398,7 +1388,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
if (s->memSize() == 0)
return HSA_STATUS_SUCCESS;
amdgpu_hsa_elf_segment_t segment = (amdgpu_hsa_elf_segment_t)(s->type() - PT_LOOS);
Segment *new_seg = nullptr;
std::shared_ptr<Segment> new_seg;
bool need_alloc = true;
if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM && nullptr != program_allocation_segment) {
new_seg = program_allocation_segment;
@@ -1407,7 +1397,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
if (need_alloc) {
void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true);
if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
new_seg = std::make_shared<Segment>(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
new_seg->Copy(s->vaddr(), s->data(), s->imageSize());
objects.push_back(new_seg);
@@ -1416,7 +1406,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
}
}
assert(new_seg);
loaded_code_objects.back()->LoadedSegments().push_back(new_seg);
loaded_code_objects.back()->LoadedSegments().push_back(new_seg.get());
return HSA_STATUS_SUCCESS;
}
@@ -1471,7 +1461,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
}
uint64_t address = SymbolAddress(agent, sym);
SymbolImpl *symbol = nullptr;
std::shared_ptr<SymbolImpl> symbol;
if (string_ends_with(sym->GetSymbolName(), ".kd")) {
// V3.
llvm::amdhsa::kernel_descriptor_t kd;
@@ -1486,7 +1476,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
uint64_t size = sym->Size();
KernelSymbol *kernel_symbol = new KernelSymbol(true,
std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
sym->GetModuleName(),
sym->GetSymbolName(),
sym->Linkage(),
@@ -1502,7 +1492,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
address);
symbol = kernel_symbol;
} else if (sym->IsVariableSymbol()) {
symbol = new VariableSymbol(true,
symbol = std::make_shared<VariableSymbol>(true,
sym->GetModuleName(),
sym->GetSymbolName(),
sym->Linkage(),
@@ -1537,7 +1527,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
// calculate end of segment - symbol value.
size = sym->GetSection()->size() - sym->SectionOffset();
}
KernelSymbol *kernel_symbol = new KernelSymbol(true,
std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
sym->GetModuleName(),
sym->GetSymbolName(),
sym->Linkage(),
@@ -1970,7 +1960,7 @@ void ExecutableImpl::Print(std::ostream& out)
<< std::endl << std::endl;
out << "Loaded Objects (total " << objects.size() << ")" << std::endl;
size_t i = 0;
for (ExecutableObject* o : objects) {
for (const auto& o : objects) {
out << "Loaded Object " << i++ << ": ";
o->Print(out);
out << std::endl;
@@ -461,7 +461,7 @@ public:
};
typedef std::string ProgramSymbol;
typedef std::unordered_map<ProgramSymbol, SymbolImpl*> ProgramSymbolMap;
typedef std::unordered_map<ProgramSymbol, std::shared_ptr<SymbolImpl>> ProgramSymbolMap;
typedef std::pair<std::string, hsa_agent_t> AgentSymbol;
struct ASC {
@@ -476,7 +476,7 @@ struct ASH {
return h ^ (i << 1);
}
};
typedef std::unordered_map<AgentSymbol, SymbolImpl*, ASH, ASC> AgentSymbolMap;
typedef std::unordered_map<AgentSymbol, std::shared_ptr<SymbolImpl>, ASH, ASC> AgentSymbolMap;
class ExecutableImpl final: public Executable {
friend class AmdHsaCodeLoader;
@@ -634,15 +634,15 @@ private:
ProgramSymbolMap program_symbols_;
AgentSymbolMap agent_symbols_;
std::vector<ExecutableObject*> objects;
Segment *program_allocation_segment;
std::vector<LoadedCodeObjectImpl*> loaded_code_objects;
std::vector<std::shared_ptr<ExecutableObject>> objects;
std::shared_ptr<Segment> program_allocation_segment;
std::vector<std::shared_ptr<LoadedCodeObjectImpl>> loaded_code_objects;
};
class AmdHsaCodeLoader : public Loader {
private:
Context* context;
std::vector<Executable*> executables;
std::vector<std::shared_ptr<Executable>> executables;
amd::hsa::common::ReaderWriterLock rw_lock_;
public:
@@ -282,7 +282,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
size_t interval, size_t latency, size_t buffer_size,
hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data,
hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) {
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
handle->handle = ++pc_sampling_id_;
// create a new PcSamplingSession(agent, method, units, interval, latency, buffer_size,
@@ -305,7 +305,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
}
hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
if (pcSamplingSessionIt == pc_sampling_.end()) {
debug_warning(false && "Cannot find PcSampling session");
@@ -319,7 +319,7 @@ hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
}
hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
if (pcSamplingSessionIt == pc_sampling_.end()) {
debug_warning(false && "Cannot find PcSampling session");
@@ -331,7 +331,7 @@ hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
}
hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
if (pcSamplingSessionIt == pc_sampling_.end()) {
debug_warning(false && "Cannot find PcSampling session");
@@ -343,7 +343,7 @@ hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
}
hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) {
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
if (pcSamplingSessionIt == pc_sampling_.end()) {
debug_warning(false && "Cannot find PcSampling session");
@@ -166,7 +166,7 @@ class PcsRuntime {
}
// Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle
std::map<uint64_t, PcSamplingSession> pc_sampling_;
KernelMutex pc_sampling_lock_;
std::mutex pc_sampling_lock_;
uint64_t pc_sampling_id_;
DISALLOW_COPY_AND_ASSIGN(PcsRuntime);