SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers (#2146)
* SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers
* Replace KernelMutex and KernelSharedMutex abstractions with std::mutex and std::shared_mutex
* Replaced unique_locks with lock_guards
* More changes
* Replace new and deletes with smart pointers
* Replaced some more with shared ptrs
* Replacements with smart pointers - pt 2
* missed change
Esse commit está contido em:
@@ -47,6 +47,7 @@
|
||||
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "core/inc/checked.h"
|
||||
#include "core/inc/isa.h"
|
||||
@@ -291,7 +292,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
|
||||
void* value) const = 0;
|
||||
|
||||
// @brief Returns an array of regions owned by the agent.
|
||||
virtual const std::vector<const core::MemoryRegion*>& regions() const = 0;
|
||||
virtual const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const = 0;
|
||||
|
||||
// @brief Returns the ISAs supported by the agent.
|
||||
// @details The returned vector is a list of pointers to the supported ISA,
|
||||
@@ -336,7 +337,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
|
||||
__forceinline void Disable() { enabled_ = false; }
|
||||
|
||||
virtual void Trim() {
|
||||
for (auto region : regions()) region->Trim();
|
||||
for (const auto& region : regions()) region.get()->Trim();
|
||||
}
|
||||
|
||||
virtual void ReleaseResources() { }
|
||||
@@ -385,7 +386,7 @@ protected:
|
||||
// Serial memory operations are needed to ensure, among other things, that allocation failures are
|
||||
// due to true OOM conditions and per region caching (Trim and Allocate must be serial and
|
||||
// exclusive to ensure this).
|
||||
KernelMutex agent_memory_lock_;
|
||||
std::mutex agent_memory_lock_;
|
||||
|
||||
// Forbid copying and moving of this object
|
||||
DISALLOW_COPY_AND_ASSIGN(Agent);
|
||||
|
||||
@@ -82,7 +82,7 @@ public:
|
||||
/// @brief Override from core::Agent.
|
||||
const std::vector<const core::Isa*>& supported_isas() const override { return supported_isas_; }
|
||||
|
||||
const std::vector<const core::MemoryRegion*>& regions() const override { return regions_; }
|
||||
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override { return regions_; }
|
||||
|
||||
/// @brief Getter for the AIE system allocator.
|
||||
const std::function<void*(size_t size, size_t align, core::MemoryRegion::AllocateFlags flags)>&
|
||||
@@ -101,7 +101,7 @@ private:
|
||||
/// @brief Setup the memory allocators used by this agent.
|
||||
void InitAllocators();
|
||||
|
||||
std::vector<const core::MemoryRegion *> regions_;
|
||||
std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
|
||||
std::function<void *(size_t size, size_t align,
|
||||
core::MemoryRegion::AllocateFlags flags)>
|
||||
system_allocator_;
|
||||
|
||||
@@ -306,7 +306,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
|
||||
// GPU-visible indirect buffer holding PM4 commands.
|
||||
void* pm4_ib_buf_;
|
||||
uint32_t pm4_ib_size_b_;
|
||||
KernelMutex pm4_ib_mutex_;
|
||||
std::mutex pm4_ib_mutex_;
|
||||
|
||||
// Error handler control variable.
|
||||
std::atomic<uint32_t> dynamicScratchState, exceptionState;
|
||||
@@ -322,11 +322,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
|
||||
Signal* exception_signal_;
|
||||
|
||||
// CU mask lock
|
||||
KernelMutex mask_lock_;
|
||||
std::mutex mask_lock_;
|
||||
|
||||
// Mutex to prevent AsyncReclaimScratch and HandleInsufficientScratch from
|
||||
// happening at the same time.
|
||||
KernelMutex scratch_lock_;
|
||||
std::mutex scratch_lock_;
|
||||
|
||||
// Current CU mask
|
||||
std::vector<uint32_t> cu_mask_;
|
||||
@@ -345,10 +345,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
|
||||
}
|
||||
|
||||
// Mutex for queue_event_ manipulation
|
||||
KernelMutex& queue_lock() {
|
||||
std::mutex& queue_lock() {
|
||||
// This allocation is meant to last until the last thread has exited.
|
||||
// It is intentionally not freed.
|
||||
static KernelMutex* queue_lock_ = new KernelMutex();
|
||||
static std::mutex* queue_lock_ = new std::mutex();
|
||||
return *queue_lock_;
|
||||
}
|
||||
|
||||
|
||||
@@ -255,7 +255,7 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
|
||||
|
||||
// Internal signals for blocking APIs
|
||||
core::unique_signal_ptr signals_[2];
|
||||
KernelMutex lock_;
|
||||
std::mutex lock_;
|
||||
bool parity_;
|
||||
|
||||
/// Queue resource descriptor for doorbell, read
|
||||
|
||||
@@ -127,7 +127,7 @@ class CpuAgent : public core::Agent {
|
||||
}
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
const std::vector<const core::MemoryRegion*>& regions() const override {
|
||||
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
|
||||
return regions_;
|
||||
}
|
||||
|
||||
@@ -151,7 +151,7 @@ class CpuAgent : public core::Agent {
|
||||
// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
|
||||
// region returns ::HSA_STATUS_SUCCESS.
|
||||
hsa_status_t VisitRegion(
|
||||
const std::vector<const core::MemoryRegion*>& regions,
|
||||
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
|
||||
hsa_status_t (*callback)(hsa_region_t region, void* data),
|
||||
void* data) const;
|
||||
|
||||
@@ -166,7 +166,7 @@ class CpuAgent : public core::Agent {
|
||||
std::vector<std::unique_ptr<core::Cache>> caches_;
|
||||
|
||||
// @brief Array of regions owned by this agent.
|
||||
std::vector<const core::MemoryRegion*> regions_;
|
||||
std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(CpuAgent);
|
||||
};
|
||||
|
||||
@@ -394,7 +394,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
}
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
const std::vector<const core::MemoryRegion*>& regions() const override {
|
||||
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
|
||||
return regions_;
|
||||
}
|
||||
|
||||
@@ -536,7 +536,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
|
||||
// region returns ::HSA_STATUS_SUCCESS.
|
||||
hsa_status_t VisitRegion(
|
||||
const std::vector<const core::MemoryRegion*>& regions,
|
||||
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
|
||||
hsa_status_t (*callback)(hsa_region_t region, void* data),
|
||||
void* data) const;
|
||||
|
||||
@@ -594,7 +594,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
std::vector<const core::Agent*> xgmi_peer_list_;
|
||||
|
||||
// Protects xgmi_peer_list_
|
||||
KernelMutex xgmi_peer_list_lock_;
|
||||
std::mutex xgmi_peer_list_lock_;
|
||||
|
||||
// @brief AQL queues for cache management and blit compute usage.
|
||||
enum QueueEnum {
|
||||
@@ -607,19 +607,19 @@ class GpuAgent : public GpuAgentInt {
|
||||
lazy_ptr<core::Queue> queues_[QueueCount];
|
||||
|
||||
// @brief Mutex to protect the update to coherency type.
|
||||
KernelMutex coherency_lock_;
|
||||
std::mutex coherency_lock_;
|
||||
|
||||
// @brief Mutex to protect access to scratch pool.
|
||||
KernelMutex scratch_lock_;
|
||||
std::mutex scratch_lock_;
|
||||
|
||||
// @brief Mutex to protect access to ::t1_.
|
||||
KernelMutex t1_lock_;
|
||||
std::mutex t1_lock_;
|
||||
|
||||
// @brief Mutex to protect access to blit objects.
|
||||
KernelMutex blit_lock_;
|
||||
std::mutex blit_lock_;
|
||||
|
||||
// @brief Mutex to protect sdma gang submissions.
|
||||
KernelMutex sdma_gang_lock_;
|
||||
std::mutex sdma_gang_lock_;
|
||||
|
||||
// @brief GPU tick on initialization.
|
||||
HsaClockCounters t0_;
|
||||
@@ -638,7 +638,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
std::vector<std::unique_ptr<core::Cache>> caches_;
|
||||
|
||||
// @brief Array of regions owned by this agent.
|
||||
std::vector<const core::MemoryRegion*> regions_;
|
||||
std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
|
||||
|
||||
core::Isa* isa_;
|
||||
|
||||
@@ -729,7 +729,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
struct {
|
||||
lazy_ptr<core::Queue> queue_;
|
||||
int ref_ct_;
|
||||
KernelMutex lock_;
|
||||
std::mutex lock_;
|
||||
} gws_queue_;
|
||||
|
||||
// @brief list of AQL queues owned by this agent. Indexed by queue pointer
|
||||
@@ -763,7 +763,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
/// @brief Coarse-grain deallocator on this GPU.
|
||||
std::function<void(void*)> coarsegrain_deallocator_;
|
||||
|
||||
void* trap_handler_tma_region_;
|
||||
std::unique_ptr<void, std::function<void(void*)>> trap_handler_tma_region_;
|
||||
|
||||
/* PC Sampling fields - begin */
|
||||
/* 2nd level Trap handler code is based on the offsets within this structure */
|
||||
|
||||
@@ -181,7 +181,7 @@ namespace code {
|
||||
std::vector<Segment*> dataSegments;
|
||||
std::vector<Section*> dataSections;
|
||||
std::vector<RelocationSection*> relocationSections;
|
||||
std::vector<Symbol*> symbols;
|
||||
std::vector<std::shared_ptr<Symbol>> symbols;
|
||||
bool combineDataSegments;
|
||||
Segment* hsaSegments[AMDGPU_HSA_SEGMENT_LAST][2];
|
||||
Section* hsaSections[AMDGPU_HSA_SECTION_LAST];
|
||||
@@ -234,7 +234,7 @@ namespace code {
|
||||
uint32_t OsAbi() const { return img->OsAbi(); }
|
||||
|
||||
AmdHsaCode(bool combineDataSegments = true);
|
||||
virtual ~AmdHsaCode();
|
||||
virtual ~AmdHsaCode() = default;
|
||||
|
||||
std::string output() { return out.str(); }
|
||||
bool LoadFromFile(const std::string& filename);
|
||||
@@ -347,7 +347,7 @@ namespace code {
|
||||
RelocationSection* GetRelocationSection(size_t i) { return relocationSections[i]; }
|
||||
|
||||
size_t SymbolCount() { return symbols.size(); }
|
||||
Symbol* GetSymbol(size_t i) { return symbols[i]; }
|
||||
Symbol* GetSymbol(size_t i) { return symbols[i].get(); }
|
||||
Symbol* GetSymbolByElfIndex(size_t index);
|
||||
Symbol* FindSymbol(const std::string &n);
|
||||
|
||||
@@ -362,11 +362,11 @@ namespace code {
|
||||
|
||||
class AmdHsaCodeManager {
|
||||
private:
|
||||
typedef std::unordered_map<uint64_t, AmdHsaCode*> CodeMap;
|
||||
typedef std::unordered_map<uint64_t, std::shared_ptr<AmdHsaCode>> CodeMap;
|
||||
CodeMap codeMap;
|
||||
|
||||
public:
|
||||
AmdHsaCode* FromHandle(hsa_code_object_t handle);
|
||||
const std::shared_ptr<AmdHsaCode>& FromHandle(hsa_code_object_t handle);
|
||||
bool Destroy(hsa_code_object_t handle);
|
||||
};
|
||||
|
||||
|
||||
@@ -422,7 +422,7 @@ private:
|
||||
Executable(const Executable &e);
|
||||
Executable& operator=(const Executable &e);
|
||||
|
||||
static std::vector<Executable*> executables;
|
||||
static std::vector<std::shared_ptr<Executable>> executables;
|
||||
static std::mutex executables_mutex;
|
||||
};
|
||||
|
||||
|
||||
@@ -187,7 +187,7 @@ private:
|
||||
|
||||
// Protects against concurrent allow_access calls to fragments of the same block by virtue of all
|
||||
// fragments of the block routing to the same MemoryRegion.
|
||||
mutable KernelMutex access_lock_;
|
||||
mutable std::mutex access_lock_;
|
||||
|
||||
static const size_t kPageSize_;
|
||||
|
||||
|
||||
@@ -216,7 +216,7 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi
|
||||
|
||||
private:
|
||||
// Serialize packet interception processing.
|
||||
KernelMutex lock_;
|
||||
std::mutex lock_;
|
||||
|
||||
// Largest processed packet index.
|
||||
uint64_t next_packet_;
|
||||
|
||||
@@ -103,7 +103,7 @@ class IPCSignal : private SharedMemorySignal, public BusyWaitSignal {
|
||||
static int rtti_id_ = 0;
|
||||
return rtti_id_;
|
||||
}
|
||||
static KernelMutex lock_;
|
||||
static std::mutex lock_;
|
||||
|
||||
explicit IPCSignal(SharedMemorySignal&& abi_block)
|
||||
: SharedMemorySignal(std::move(abi_block)), BusyWaitSignal(signal(), true) {}
|
||||
|
||||
@@ -51,6 +51,7 @@
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <thread>
|
||||
#include <shared_mutex>
|
||||
#if defined(__linux__)
|
||||
#include <sys/un.h>
|
||||
#include <xf86drm.h>
|
||||
@@ -437,15 +438,15 @@ class Runtime {
|
||||
|
||||
Agent* region_gpu() { return region_gpu_; }
|
||||
|
||||
const std::vector<const MemoryRegion*>& system_regions_fine() const {
|
||||
const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_fine() const {
|
||||
return system_regions_fine_;
|
||||
}
|
||||
|
||||
const std::vector<const MemoryRegion*>& system_regions_coarse() const {
|
||||
const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_coarse() const {
|
||||
return system_regions_coarse_;
|
||||
}
|
||||
|
||||
amd::hsa::loader::Loader* loader() { return loader_; }
|
||||
amd::hsa::loader::Loader* loader() { return loader_.get(); }
|
||||
|
||||
amd::LoaderContext* loader_context() { return &loader_context_; }
|
||||
|
||||
@@ -719,10 +720,10 @@ class Runtime {
|
||||
|
||||
// Will be created before any user could call hsa_init but also could be
|
||||
// destroyed before incorrectly written programs call hsa_shutdown.
|
||||
static __forceinline KernelMutex& bootstrap_lock() {
|
||||
static __forceinline std::mutex& bootstrap_lock() {
|
||||
// This allocation is meant to last until the last thread has exited.
|
||||
// It is intentionally not freed.
|
||||
static KernelMutex* bootstrap_lock_ = new KernelMutex;
|
||||
static std::mutex* bootstrap_lock_ = new std::mutex;
|
||||
return *bootstrap_lock_;
|
||||
}
|
||||
Runtime();
|
||||
@@ -780,7 +781,7 @@ class Runtime {
|
||||
// Also ensures atomicity of pointer info queries by interlocking
|
||||
// KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
|
||||
// registered & mapped arrays.
|
||||
KernelSharedMutex memory_lock_;
|
||||
std::shared_mutex memory_lock_;
|
||||
|
||||
// Array containing driver interfaces for compatible agent kernel-mode
|
||||
// drivers. Currently supports AIE agents.
|
||||
@@ -811,16 +812,16 @@ class Runtime {
|
||||
std::vector<uint32_t> gpu_ids_;
|
||||
|
||||
// List of all fine-grain system memory regions in the platform.
|
||||
std::vector<const MemoryRegion*> system_regions_fine_;
|
||||
std::vector<std::shared_ptr<const MemoryRegion>> system_regions_fine_;
|
||||
|
||||
// List of all coarse-grain system memory regions in the platform.
|
||||
std::vector<const MemoryRegion*> system_regions_coarse_;
|
||||
std::vector<std::shared_ptr<const MemoryRegion>> system_regions_coarse_;
|
||||
|
||||
// Matrix of IO link.
|
||||
std::vector<LinkInfo> link_matrix_;
|
||||
|
||||
// Loader instance.
|
||||
amd::hsa::loader::Loader* loader_;
|
||||
std::unique_ptr<amd::hsa::loader::Loader> loader_;
|
||||
|
||||
// Loader context.
|
||||
amd::LoaderContext loader_context_;
|
||||
@@ -832,7 +833,7 @@ class Runtime {
|
||||
std::map<const void*, AllocationRegion> allocation_map_;
|
||||
|
||||
// Pending prefetch containers.
|
||||
KernelMutex prefetch_lock_;
|
||||
std::mutex prefetch_lock_;
|
||||
prefetch_map_t prefetch_map_;
|
||||
|
||||
// Allocator using ::system_region_
|
||||
@@ -853,24 +854,29 @@ class Runtime {
|
||||
// Number of Numa Nodes
|
||||
size_t num_nodes_;
|
||||
|
||||
struct HsaEventDeleter {
|
||||
void operator()(HsaEvent* event) { InterruptSignal::DestroyEvent(event); }
|
||||
};
|
||||
using unique_hsa_event_ptr = std::unique_ptr<HsaEvent, HsaEventDeleter>;
|
||||
|
||||
// @brief AMD HSA event to monitor for virtual memory access fault.
|
||||
HsaEvent* vm_fault_event_;
|
||||
unique_hsa_event_ptr vm_fault_event_;
|
||||
|
||||
// @brief HSA signal to contain the VM fault event.
|
||||
Signal* vm_fault_signal_;
|
||||
unique_signal_ptr vm_fault_signal_;
|
||||
|
||||
// @brief AMD HSA event to monitor for HW exceptions.
|
||||
HsaEvent* hw_exception_event_;
|
||||
unique_hsa_event_ptr hw_exception_event_;
|
||||
|
||||
// @brief HSA signal to contain the HW exception event.
|
||||
Signal* hw_exception_signal_;
|
||||
unique_signal_ptr hw_exception_signal_;
|
||||
|
||||
// Custom system event handlers.
|
||||
std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
|
||||
system_event_handlers_;
|
||||
|
||||
// System event handler lock
|
||||
KernelMutex system_event_lock_;
|
||||
std::mutex system_event_lock_;
|
||||
|
||||
// Internal queue creation notifier
|
||||
AMD::callback_t<hsa_amd_runtime_queue_notifier> internal_queue_create_notifier_;
|
||||
@@ -898,8 +904,8 @@ class Runtime {
|
||||
|
||||
// IPC DMA buf unix domain socket server dmabuf FD passing
|
||||
int ipc_sock_server_fd_;
|
||||
std::map<uint64_t, size_t> ipc_sock_server_conns_;
|
||||
KernelMutex ipc_sock_server_lock_;
|
||||
std::map<uint64_t, int> ipc_sock_server_conns_;
|
||||
std::mutex ipc_sock_server_lock_;
|
||||
|
||||
private:
|
||||
void CheckVirtualMemApiSupport();
|
||||
|
||||
@@ -50,6 +50,7 @@
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <mutex>
|
||||
|
||||
#include "hsakmt/hsakmt.h"
|
||||
|
||||
@@ -499,7 +500,7 @@ class Signal {
|
||||
core::Agent* async_copy_agent_;
|
||||
|
||||
private:
|
||||
static KernelMutex ipcLock_;
|
||||
static std::mutex ipcLock_;
|
||||
static std::map<decltype(hsa_signal_t::handle), Signal*> ipcMap_;
|
||||
|
||||
static Signal* lookupIpc(hsa_signal_t signal);
|
||||
|
||||
@@ -66,7 +66,6 @@ AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props)
|
||||
}
|
||||
|
||||
AieAgent::~AieAgent() {
|
||||
std::for_each(regions_.begin(), regions_.end(), DeleteObject());
|
||||
regions_.clear();
|
||||
}
|
||||
|
||||
@@ -75,8 +74,8 @@ hsa_status_t AieAgent::VisitRegion(bool include_peer,
|
||||
void *data),
|
||||
void *data) const {
|
||||
AMD::callback_t<decltype(callback)> call(callback);
|
||||
for (const auto r : regions_) {
|
||||
hsa_region_t region_handle(core::MemoryRegion::Convert(r));
|
||||
for (const auto& r : regions_) {
|
||||
hsa_region_t region_handle(core::MemoryRegion::Convert(r.get()));
|
||||
hsa_status_t err = call(region_handle, data);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
return err;
|
||||
@@ -321,24 +320,25 @@ void AieAgent::InitRegionList() {
|
||||
/// explicit sync operations.
|
||||
regions_.reserve(3);
|
||||
regions_.push_back(
|
||||
new MemoryRegion(false, true, false, false, true, this, sys_mem_props));
|
||||
std::make_shared<MemoryRegion>(false, true, false, false, true, this, sys_mem_props));
|
||||
regions_.push_back(
|
||||
new MemoryRegion(false, false, false, false, true, this, dev_mem_props));
|
||||
regions_.push_back(new MemoryRegion(false, false, false, false, true, this,
|
||||
other_mem_props));
|
||||
std::make_shared<MemoryRegion>(false, false, false, false, true, this, dev_mem_props));
|
||||
regions_.push_back(
|
||||
std::make_shared<MemoryRegion>(false, false, false, false, true, this, other_mem_props));
|
||||
}
|
||||
|
||||
void AieAgent::InitAllocators() {
|
||||
for (const auto *region : regions()) {
|
||||
for (const auto& region : regions()) {
|
||||
const MemoryRegion *amd_mem_region(
|
||||
static_cast<const MemoryRegion *>(region));
|
||||
static_cast<const MemoryRegion *>(region.get()));
|
||||
if (amd_mem_region->kernarg()) {
|
||||
const core::MemoryRegion* region_ptr = region.get();
|
||||
system_allocator_ =
|
||||
[region](size_t size, size_t align,
|
||||
[region_ptr](size_t size, size_t align,
|
||||
core::MemoryRegion::AllocateFlags alloc_flags) -> void * {
|
||||
void *mem(nullptr);
|
||||
return (core::Runtime::runtime_singleton_->AllocateMemory(
|
||||
region, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
|
||||
region_ptr, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
|
||||
? mem
|
||||
: nullptr;
|
||||
};
|
||||
|
||||
@@ -165,8 +165,8 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
|
||||
// Set group and private memory apertures in amd_queue_.
|
||||
auto& regions = agent->regions();
|
||||
|
||||
for (auto region : regions) {
|
||||
const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region);
|
||||
for (const auto& region : regions) {
|
||||
const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region.get());
|
||||
uint64_t base = amdregion->GetBaseAddress();
|
||||
|
||||
if (amdregion->IsLDS()) {
|
||||
@@ -217,7 +217,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
|
||||
}
|
||||
|
||||
MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() {
|
||||
ScopedAcquire<KernelMutex> _lock(&queue_lock());
|
||||
std::lock_guard<std::mutex> _lock(queue_lock());
|
||||
queue_count()--;
|
||||
if (queue_count() == 0) {
|
||||
core::InterruptSignal::DestroyEvent(queue_event());
|
||||
@@ -232,7 +232,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
|
||||
});
|
||||
|
||||
if (core::g_use_interrupt_wait) {
|
||||
ScopedAcquire<KernelMutex> _lock(&queue_lock());
|
||||
std::lock_guard<std::mutex> _lock(queue_lock());
|
||||
queue_count()++;
|
||||
if (queue_event() == nullptr) {
|
||||
assert(queue_count() == 1 && "Inconsistency in queue event reference counting found.\n");
|
||||
@@ -387,7 +387,7 @@ AqlQueue::~AqlQueue() {
|
||||
FreeQueueMemory();
|
||||
|
||||
if (core::g_use_interrupt_wait) {
|
||||
ScopedAcquire<KernelMutex> lock(&queue_lock());
|
||||
std::lock_guard<std::mutex> lock(queue_lock());
|
||||
queue_count()--;
|
||||
if (queue_count() == 0) {
|
||||
core::InterruptSignal::DestroyEvent(queue_event());
|
||||
@@ -777,7 +777,7 @@ void AqlQueue::AsyncReclaimMainScratch() {
|
||||
tool::notify_event_scratch_async_reclaim_start(public_handle(),
|
||||
HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE);
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
std::lock_guard<std::mutex> lock(scratch_lock_);
|
||||
|
||||
// Unmap the queue. CP will check amd_queue_ fields on re-map
|
||||
Suspend();
|
||||
@@ -849,7 +849,7 @@ void AqlQueue::AsyncReclaimAltScratch() {
|
||||
tool::notify_event_scratch_async_reclaim_start(public_handle(),
|
||||
HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT);
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
std::lock_guard<std::mutex> lock(scratch_lock_);
|
||||
|
||||
// Unmap the queue. CP will check amd_queue_ fields on re-map
|
||||
Suspend();
|
||||
@@ -1014,7 +1014,7 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
|
||||
const uint64_t device_size = size_per_thread * lanes_per_wave * device_slots;
|
||||
const uint64_t dispatch_size = size_per_thread * lanes_per_wave * dispatch_slots;
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
std::lock_guard<std::mutex> lock(scratch_lock_);
|
||||
|
||||
// scratch.use_alt_limit will be 0 if alt scratch is not supported or disabled
|
||||
if (dispatch_size < scratch.use_alt_limit && dispatch_slots < device_slots) {
|
||||
@@ -1393,7 +1393,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
|
||||
if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask;
|
||||
|
||||
// Apply mask if non-default or not queue initialization.
|
||||
ScopedAcquire<KernelMutex> lock(&mask_lock_);
|
||||
std::lock_guard<std::mutex> lock(mask_lock_);
|
||||
if ((!cu_mask_.empty()) || (num_cu_mask_count != 0) || (!global_mask.empty())) {
|
||||
|
||||
// Devices with WGPs must conform to even-indexed contiguous pairwise CU enablement.
|
||||
@@ -1414,7 +1414,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
|
||||
}
|
||||
|
||||
hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) {
|
||||
ScopedAcquire<KernelMutex> lock(&mask_lock_);
|
||||
std::lock_guard<std::mutex> lock(mask_lock_);
|
||||
assert(!cu_mask_.empty() && "No current cu_mask!");
|
||||
|
||||
uint32_t user_dword_count = num_cu_mask_count / 32;
|
||||
@@ -1440,7 +1440,7 @@ void AqlQueue::SetProfiling(bool enabled) {
|
||||
void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope_t acquireFence,
|
||||
hsa_fence_scope_t releaseFence, hsa_signal_t* in_signal) {
|
||||
// pm4_ib_buf_ is a shared resource, so mutually exclude here.
|
||||
ScopedAcquire<KernelMutex> lock(&pm4_ib_mutex_);
|
||||
std::lock_guard<std::mutex> lock(pm4_ib_mutex_);
|
||||
|
||||
// Obtain reference to any container queue.
|
||||
core::Queue* queue = core::Queue::Convert(public_handle());
|
||||
|
||||
@@ -293,7 +293,7 @@ static bool DepSignalCompleteHandler(hsa_signal_value_t signal_value, void *arg
|
||||
template <bool useGCR>
|
||||
hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd_size,
|
||||
uint64_t size) {
|
||||
ScopedAcquire<KernelMutex> lock(&lock_);
|
||||
std::unique_lock<std::mutex> lock(lock_);
|
||||
|
||||
// Alternate between completion signals
|
||||
// Using two allows overlapping command writing and copies
|
||||
@@ -310,7 +310,7 @@ hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd
|
||||
// Mark signal as in use, guard against exception leaving the signal in an unusable state.
|
||||
completionSignal->StoreRelaxed(2);
|
||||
MAKE_SCOPE_GUARD([&]() { completionSignal->StoreRelaxed(0); });
|
||||
lock.Release();
|
||||
lock.unlock();
|
||||
|
||||
std::vector<core::Signal*> gang_signals(0);
|
||||
|
||||
|
||||
@@ -64,7 +64,6 @@ CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props,
|
||||
}
|
||||
|
||||
CpuAgent::~CpuAgent() {
|
||||
std::for_each(regions_.begin(), regions_.end(), DeleteObject());
|
||||
regions_.clear();
|
||||
}
|
||||
|
||||
@@ -87,17 +86,17 @@ void CpuAgent::InitRegionList() {
|
||||
if (system_prop != mem_props.end()) system_props = *system_prop;
|
||||
|
||||
// Fine-Grain Memory
|
||||
regions_.push_back(new MemoryRegion(true, false, is_apu_node, false, true, this, system_props));
|
||||
regions_.push_back(std::make_shared<MemoryRegion>(true, false, is_apu_node, false, true, this, system_props));
|
||||
|
||||
// Ext-Fine-Grain Memory
|
||||
regions_.push_back(new MemoryRegion(false, false, is_apu_node, true, true, this, system_props));
|
||||
regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, true, true, this, system_props));
|
||||
|
||||
// Kernargs
|
||||
regions_.push_back(new MemoryRegion(true, true, is_apu_node, false, true, this, system_props));
|
||||
regions_.push_back(std::make_shared<MemoryRegion>(true, true, is_apu_node, false, true, this, system_props));
|
||||
|
||||
if (!is_apu_node) {
|
||||
// Coarse Grain
|
||||
regions_.push_back(new MemoryRegion(false, false, is_apu_node, false, true, this, system_props));
|
||||
regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, false, true, this, system_props));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -150,12 +149,12 @@ hsa_status_t CpuAgent::VisitRegion(bool include_peer,
|
||||
}
|
||||
|
||||
hsa_status_t CpuAgent::VisitRegion(
|
||||
const std::vector<const core::MemoryRegion*>& regions,
|
||||
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
|
||||
hsa_status_t (*callback)(hsa_region_t region, void* data),
|
||||
void* data) const {
|
||||
for (const core::MemoryRegion* region : regions) {
|
||||
for (const std::shared_ptr<const rocr::core::MemoryRegion>& region : regions) {
|
||||
if (!region->user_visible()) continue;
|
||||
hsa_region_t region_handle = core::MemoryRegion::Convert(region);
|
||||
hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
|
||||
hsa_status_t status = callback(region_handle, data);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
|
||||
@@ -112,7 +112,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
|
||||
scratch_limit_async_threshold_(0),
|
||||
scratch_cache_(
|
||||
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
|
||||
trap_handler_tma_region_(NULL),
|
||||
trap_handler_tma_region_(nullptr, [this](void* ptr){
|
||||
if (ptr && this->finegrain_allocator_) this->finegrain_deallocator()(ptr);
|
||||
}),
|
||||
rec_sdma_eng_override_(false),
|
||||
pcs_hosttrap_data_(),
|
||||
pcs_stochastic_data_(),
|
||||
@@ -246,7 +248,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
|
||||
GpuAgent::~GpuAgent() {
|
||||
for (auto& blit : blits_) blit.reset();
|
||||
|
||||
std::for_each(regions_.begin(), regions_.end(), DeleteObject());
|
||||
regions_.clear();
|
||||
}
|
||||
|
||||
@@ -454,22 +455,20 @@ void GpuAgent::InitRegionList() {
|
||||
memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
|
||||
case HSA_HEAPTYPE_GPU_LDS:
|
||||
case HSA_HEAPTYPE_GPU_SCRATCH: {
|
||||
MemoryRegion* region =
|
||||
new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]);
|
||||
|
||||
std::shared_ptr<MemoryRegion> region = std::make_shared<MemoryRegion>(false, false, false, false, true, this, mem_props[mem_idx]);
|
||||
regions_.push_back(region);
|
||||
|
||||
if (region->IsLocalMemory()) {
|
||||
// Extended Fine-Grain memory
|
||||
if (!(isa_->GetMajorVersion() == 12 && isa_->GetMinorVersion() == 0))
|
||||
regions_.push_back(
|
||||
new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx]));
|
||||
std::make_shared<MemoryRegion>(false, false, false, true, true, this, mem_props[mem_idx]));
|
||||
|
||||
// Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
|
||||
bool user_visible = (properties_.HiveID != 0) ||
|
||||
core::Runtime::runtime_singleton_->flag().fine_grain_pcie();
|
||||
|
||||
regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this,
|
||||
regions_.push_back(std::make_shared<MemoryRegion>(true, false, false, false, user_visible, this,
|
||||
mem_props[mem_idx]));
|
||||
}
|
||||
break;
|
||||
@@ -561,7 +560,7 @@ void GpuAgent::ReserveScratch()
|
||||
size_t available;
|
||||
hsa_status_t err = driver().AvailableMemory(node_id(), &available);
|
||||
assert(err == HSA_STATUS_SUCCESS && "AvailableMemory failed");
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
std::lock_guard<std::mutex> lock(scratch_lock_);
|
||||
if (!scratch_cache_.reserved_bytes() && reserved_sz && available > 8 * reserved_sz) {
|
||||
HSAuint64 alt_va;
|
||||
void* reserved_base = scratch_pool_.alloc(reserved_sz);
|
||||
@@ -676,20 +675,20 @@ hsa_status_t GpuAgent::VisitRegion(bool include_peer,
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::VisitRegion(
|
||||
const std::vector<const core::MemoryRegion*>& regions,
|
||||
const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
|
||||
hsa_status_t (*callback)(hsa_region_t region, void* data),
|
||||
void* data) const {
|
||||
AMD::callback_t<decltype(callback)> call(callback);
|
||||
for (const core::MemoryRegion* region : regions) {
|
||||
for (const auto& region : regions) {
|
||||
if (!region->user_visible()) continue;
|
||||
|
||||
const AMD::MemoryRegion* amd_region =
|
||||
reinterpret_cast<const AMD::MemoryRegion*>(region);
|
||||
reinterpret_cast<const AMD::MemoryRegion*>(region.get());
|
||||
|
||||
// Only expose system, local, and LDS memory.
|
||||
if (amd_region->IsSystem() || amd_region->IsLocalMemory() ||
|
||||
amd_region->IsLDS()) {
|
||||
hsa_region_t region_handle = core::MemoryRegion::Convert(region);
|
||||
hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
|
||||
hsa_status_t status = call(region_handle, data);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
@@ -910,7 +909,7 @@ void GpuAgent::InitGWS() {
|
||||
}
|
||||
|
||||
void GpuAgent::GWSRelease() {
|
||||
ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
|
||||
std::lock_guard<std::mutex> lock(gws_queue_.lock_);
|
||||
gws_queue_.ref_ct_--;
|
||||
if (gws_queue_.ref_ct_ != 0) return;
|
||||
InitGWS();
|
||||
@@ -968,22 +967,22 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) {
|
||||
}
|
||||
|
||||
void GpuAgent::SetCopyRequestRefCount(bool set) {
|
||||
ScopedAcquire<KernelMutex> lock(&blit_lock_);
|
||||
std::unique_lock<std::mutex> lock(blit_lock_);
|
||||
while (pending_copy_stat_check_ref_) {
|
||||
blit_lock_.Release();
|
||||
lock.unlock();
|
||||
os::YieldThread();
|
||||
blit_lock_.Acquire();
|
||||
lock.lock();
|
||||
}
|
||||
if (!set && pending_copy_req_ref_) pending_copy_req_ref_--;
|
||||
else pending_copy_req_ref_++;
|
||||
}
|
||||
|
||||
void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
|
||||
ScopedAcquire<KernelMutex> lock(&blit_lock_);
|
||||
std::unique_lock<std::mutex> lock(blit_lock_);
|
||||
while (pending_copy_req_ref_) {
|
||||
blit_lock_.Release();
|
||||
lock.unlock();
|
||||
os::YieldThread();
|
||||
blit_lock_.Acquire();
|
||||
lock.lock();
|
||||
}
|
||||
if (!set && pending_copy_stat_check_ref_) pending_copy_stat_check_ref_--;
|
||||
else pending_copy_stat_check_ref_++;
|
||||
@@ -1059,7 +1058,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
|
||||
std::min(gang_factor, properties_.NumSdmaXgmiEngines);
|
||||
}
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&sdma_gang_lock_);
|
||||
std::lock_guard<std::mutex> lock(sdma_gang_lock_);
|
||||
// Manage internal gang signals
|
||||
std::vector<core::Signal*> gang_signals;
|
||||
if (gang_factor > 1) {
|
||||
@@ -1642,7 +1641,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
|
||||
|
||||
if (status != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
for (auto r : regions()) availableBytes += ((AMD::MemoryRegion*)r)->GetCacheSize();
|
||||
for (const auto& r : regions()) availableBytes += ((AMD::MemoryRegion*)(r.get()))->GetCacheSize();
|
||||
|
||||
availableBytes += scratch_cache_.free_bytes() - scratch_cache_.reserved_bytes();
|
||||
|
||||
@@ -1730,7 +1729,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
|
||||
core::Queue** queue) {
|
||||
// Handle GWS queues.
|
||||
if (queue_type == HSA_QUEUE_TYPE_COOPERATIVE) {
|
||||
ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
|
||||
std::lock_guard<std::mutex> lock(gws_queue_.lock_);
|
||||
auto ret = (*gws_queue_.queue_).get();
|
||||
if (ret != nullptr) {
|
||||
gws_queue_.ref_ct_++;
|
||||
@@ -1876,7 +1875,7 @@ void GpuAgent::AcquireQueueMainScratch(ScratchInfo& scratch) {
|
||||
*/
|
||||
bool large;
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
std::lock_guard<std::mutex> lock(scratch_lock_);
|
||||
const size_t small_limit = scratch_pool_.size() >> 3;
|
||||
bool use_reclaim = true;
|
||||
|
||||
@@ -2035,7 +2034,7 @@ void GpuAgent::AcquireQueueAltScratch(ScratchInfo& scratch) {
|
||||
uint64_t size_per_wave = AlignUp(scratch.alt_size_per_thread * properties_.WaveFrontSize, 1024);
|
||||
if (size_per_wave > MAX_WAVE_SCRATCH) return;
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
std::lock_guard<std::mutex> lock(scratch_lock_);
|
||||
|
||||
// Ensure mapping will be in whole pages.
|
||||
scratch.alt_size = AlignUp(scratch.alt_size, 4096);
|
||||
@@ -2176,7 +2175,7 @@ uint64_t GpuAgent::TranslateTime(uint64_t tick) {
|
||||
// Limit errors due to relative frequency drift to ~0.5us. Sync clocks at 16Hz.
|
||||
const int64_t max_extrapolation = core::Runtime::runtime_singleton_->sys_clock_freq() >> 4;
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&t1_lock_);
|
||||
std::lock_guard<std::mutex> lock(t1_lock_);
|
||||
// Limit errors due to correlated pair certainty to ~0.5us.
|
||||
// extrapolated time < (0.5us / half clock read certainty) * delay between clock measures
|
||||
// clock read certainty is <4us.
|
||||
@@ -2261,26 +2260,27 @@ hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttra
|
||||
((uint64_t*)tma_region_host)[1] = (uint64_t)pcs_stochastic_buffers;
|
||||
|
||||
if (!trap_handler_tma_region_) {
|
||||
trap_handler_tma_region_ = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0);
|
||||
if (trap_handler_tma_region_ == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
void* mem = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0);
|
||||
if (!mem) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
trap_handler_tma_region_.reset(mem);
|
||||
|
||||
// NearestCpuAgent owns pool returned system_allocator()
|
||||
auto cpuAgent = GetNearestCpuAgent()->public_handle();
|
||||
|
||||
hsa_status_t ret =
|
||||
AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_);
|
||||
AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_.get());
|
||||
assert(ret == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
/* On non-large BAR systems, we may not be able to access device memory, so do a DmaCopy */
|
||||
if (DmaCopy(trap_handler_tma_region_, tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
|
||||
if (DmaCopy(trap_handler_tma_region_.get(), tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
tma_size = 2 * sizeof(uint64_t);
|
||||
tma_addr = trap_handler_tma_region_;
|
||||
tma_addr = trap_handler_tma_region_.get();
|
||||
} else if (trap_handler_tma_region_) {
|
||||
finegrain_deallocator()(trap_handler_tma_region_);
|
||||
trap_handler_tma_region_ = NULL;
|
||||
trap_handler_tma_region_.reset(nullptr);
|
||||
}
|
||||
|
||||
// Bind the trap handler to this node.
|
||||
@@ -2398,7 +2398,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
|
||||
uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
|
||||
assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen"));
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&xgmi_peer_list_lock_);
|
||||
std::lock_guard<std::mutex> lock(xgmi_peer_list_lock_);
|
||||
|
||||
for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
|
||||
uint64_t dst_handle = dst_agent.public_handle().handle;
|
||||
@@ -2490,19 +2490,20 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
|
||||
void GpuAgent::Trim() {
|
||||
Agent::Trim();
|
||||
AsyncReclaimScratchQueues();
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
std::lock_guard<std::mutex> lock(scratch_lock_);
|
||||
scratch_cache_.trim(false);
|
||||
}
|
||||
|
||||
void GpuAgent::InitAllocators() {
|
||||
for (auto pool : GetNearestCpuAgent()->regions()) {
|
||||
for (const auto& pool : GetNearestCpuAgent()->regions()) {
|
||||
if (pool->kernarg()) {
|
||||
system_allocator_ = [pool](size_t size, size_t alignment,
|
||||
const core::MemoryRegion* pool_ptr = pool.get();
|
||||
system_allocator_ = [pool_ptr](size_t size, size_t alignment,
|
||||
MemoryRegion::AllocateFlags alloc_flags) -> void* {
|
||||
assert(alignment <= 4096);
|
||||
void* ptr = nullptr;
|
||||
return (HSA_STATUS_SUCCESS ==
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, &ptr))
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(pool_ptr, size, alloc_flags, &ptr))
|
||||
? ptr
|
||||
: nullptr;
|
||||
};
|
||||
@@ -2513,14 +2514,14 @@ void GpuAgent::InitAllocators() {
|
||||
assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool.");
|
||||
|
||||
// Setup this GPU's fine-grain and coarse-grain allocators.
|
||||
for (auto region : regions()) {
|
||||
const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region);
|
||||
for (const auto& region : regions()) {
|
||||
const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region.get());
|
||||
|
||||
auto region_allocator = [region](size_t size,
|
||||
auto region_allocator = [amd_region](size_t size,
|
||||
MemoryRegion::AllocateFlags alloc_flags) -> void* {
|
||||
void* ptr = nullptr;
|
||||
return (HSA_STATUS_SUCCESS ==
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr))
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(amd_region, size, alloc_flags, &ptr))
|
||||
? ptr
|
||||
: nullptr;
|
||||
};
|
||||
|
||||
@@ -283,18 +283,18 @@ const core::MemoryRegion* RegionMemory::AgentLocal(hsa_agent_t agent, bool is_co
|
||||
assert(amd_agent->device_type() == core::Agent::kAmdGpuDevice && "Invalid agent type.");
|
||||
auto agent_local_region =
|
||||
std::find_if(amd_agent->regions().begin(), amd_agent->regions().end(),
|
||||
[&](const core::MemoryRegion* region) {
|
||||
const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region;
|
||||
[&](const std::shared_ptr<const core::MemoryRegion>& region) {
|
||||
const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region.get();
|
||||
return amd_region->IsLocalMemory() && (!amd_region->fine_grain());
|
||||
});
|
||||
return agent_local_region == amd_agent->regions().end() ? nullptr : *agent_local_region;
|
||||
return agent_local_region == amd_agent->regions().end() ? nullptr : agent_local_region->get();
|
||||
}
|
||||
|
||||
const core::MemoryRegion* RegionMemory::System(bool is_code) {
|
||||
if (is_code)
|
||||
return core::Runtime::runtime_singleton_->system_regions_coarse()[0];
|
||||
return core::Runtime::runtime_singleton_->system_regions_coarse()[0].get();
|
||||
else
|
||||
return core::Runtime::runtime_singleton_->system_regions_fine()[0];
|
||||
return core::Runtime::runtime_singleton_->system_regions_fine()[0].get();
|
||||
}
|
||||
|
||||
bool RegionMemory::Allocate(size_t size, size_t align, bool zero) {
|
||||
|
||||
@@ -48,6 +48,8 @@
|
||||
#include "core/inc/amd_memory_region.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <mutex>
|
||||
#include <shared_mutex>
|
||||
|
||||
#include "core/inc/runtime.h"
|
||||
#include "core/inc/amd_cpu_agent.h"
|
||||
@@ -132,7 +134,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
|
||||
MemoryRegion::~MemoryRegion() {}
|
||||
|
||||
hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
|
||||
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
|
||||
std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
|
||||
return AllocateImpl(size, alloc_flags, address, agent_node_id);
|
||||
}
|
||||
|
||||
@@ -160,7 +162,7 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
|
||||
}
|
||||
|
||||
hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
|
||||
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
|
||||
std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
|
||||
return FreeImpl(address, size);
|
||||
}
|
||||
|
||||
@@ -172,7 +174,7 @@ hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {
|
||||
|
||||
// TODO: Look into a better name and/or making this process transparent to exporting.
|
||||
hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const {
|
||||
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
|
||||
std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
|
||||
if (!fragment_allocator_.discardBlock(address)) return HSA_STATUS_ERROR_INVALID_ALLOCATION;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -448,7 +450,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
|
||||
std::vector<uint64_t> union_agents;
|
||||
info.size = sizeof(info);
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&access_lock_);
|
||||
std::lock_guard<std::mutex> lock(access_lock_);
|
||||
|
||||
if (core::Runtime::runtime_singleton_->PtrInfo(const_cast<void*>(ptr), &info, malloc,
|
||||
&agent_count, &accessible,
|
||||
@@ -512,8 +514,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
|
||||
|
||||
{ // Sequence with pointer info since queries to other fragments of the block may be adjusted by
|
||||
// this call.
|
||||
ScopedAcquire<KernelSharedMutex::Shared> lock(
|
||||
core::Runtime::runtime_singleton_->memory_lock_.shared());
|
||||
std::shared_lock<std::shared_mutex> lock(core::Runtime::runtime_singleton_->memory_lock_);
|
||||
uint64_t alternate_va = 0;
|
||||
if (owner()->driver().MakeMemoryResident(ptr, size, &alternate_va, &map_flag,
|
||||
whitelist_nodes.size(),
|
||||
|
||||
@@ -1804,7 +1804,7 @@ hsa_status_t hsa_code_object_serialize(
|
||||
IS_BAD_PTR(serialized_code_object);
|
||||
IS_BAD_PTR(serialized_code_object_size);
|
||||
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
|
||||
if (!code) {
|
||||
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
|
||||
}
|
||||
@@ -1982,7 +1982,7 @@ hsa_status_t hsa_code_object_get_info(
|
||||
IS_OPEN();
|
||||
IS_BAD_PTR(value);
|
||||
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
|
||||
if (!code) {
|
||||
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
|
||||
}
|
||||
@@ -2039,7 +2039,7 @@ hsa_status_t hsa_code_object_get_symbol(
|
||||
IS_BAD_PTR(symbol_name);
|
||||
IS_BAD_PTR(symbol);
|
||||
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
|
||||
if (!code) {
|
||||
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
|
||||
}
|
||||
@@ -2059,7 +2059,7 @@ hsa_status_t hsa_code_object_get_symbol_from_name(
|
||||
IS_BAD_PTR(symbol_name);
|
||||
IS_BAD_PTR(symbol);
|
||||
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
|
||||
if (!code) {
|
||||
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
|
||||
}
|
||||
@@ -2097,7 +2097,7 @@ hsa_status_t hsa_code_object_iterate_symbols(
|
||||
IS_OPEN();
|
||||
IS_BAD_PTR(callback);
|
||||
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
|
||||
amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
|
||||
if (!code) {
|
||||
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
|
||||
}
|
||||
|
||||
@@ -759,7 +759,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
|
||||
}
|
||||
|
||||
const AMD::MemoryRegion* system_region = static_cast<const AMD::MemoryRegion*>(
|
||||
core::Runtime::runtime_singleton_->system_regions_coarse()[0]);
|
||||
core::Runtime::runtime_singleton_->system_regions_coarse()[0].get());
|
||||
|
||||
return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr);
|
||||
CATCH;
|
||||
@@ -799,7 +799,7 @@ hsa_status_t hsa_amd_memory_unlock(void* host_ptr) {
|
||||
|
||||
const AMD::MemoryRegion* system_region =
|
||||
reinterpret_cast<const AMD::MemoryRegion*>(
|
||||
core::Runtime::runtime_singleton_->system_regions_fine()[0]);
|
||||
core::Runtime::runtime_singleton_->system_regions_fine()[0].get());
|
||||
|
||||
return system_region->Unlock(host_ptr);
|
||||
CATCH;
|
||||
|
||||
@@ -340,7 +340,7 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
|
||||
return;
|
||||
}
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&lock_);
|
||||
std::lock_guard<std::mutex> lock(lock_);
|
||||
|
||||
// Submit overflow packets.
|
||||
if (!overflow_.empty()) {
|
||||
|
||||
@@ -48,7 +48,7 @@ namespace rocr {
|
||||
namespace core {
|
||||
|
||||
HsaEvent* InterruptSignal::EventPool::alloc() {
|
||||
ScopedAcquire<HybridMutex> lock(&lock_);
|
||||
std::lock_guard<HybridMutex> lock(lock_);
|
||||
if (events_.empty()) {
|
||||
if (!allEventsAllocated) {
|
||||
HsaEvent* evt = InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false);
|
||||
@@ -64,7 +64,7 @@ HsaEvent* InterruptSignal::EventPool::alloc() {
|
||||
|
||||
void InterruptSignal::EventPool::free(HsaEvent* evt) {
|
||||
if (evt == nullptr) return;
|
||||
ScopedAcquire<HybridMutex> lock(&lock_);
|
||||
std::lock_guard<HybridMutex> lock(lock_);
|
||||
events_.push_back(unique_event_ptr(evt));
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
|
||||
KernelMutex IPCSignal::lock_;
|
||||
std::mutex IPCSignal::lock_;
|
||||
|
||||
SharedMemory::SharedMemory(const hsa_amd_ipc_memory_t* handle, size_t len) {
|
||||
hsa_status_t err = Runtime::runtime_singleton_->IPCAttach(handle, len, 0, NULL, &ptr_);
|
||||
@@ -85,7 +85,7 @@ Signal* IPCSignal::Attach(const hsa_amd_ipc_signal_t* ipc_signal_handle) {
|
||||
|
||||
hsa_signal_t handle = SharedSignal::Convert(shared.signal());
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&lock_);
|
||||
std::lock_guard<std::mutex> lock(lock_);
|
||||
Signal* ret = core::Signal::DuplicateHandle(handle);
|
||||
if (ret == nullptr) ret = new IPCSignal(std::move(shared));
|
||||
return ret;
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <shared_mutex>
|
||||
#if defined(__linux__)
|
||||
#include <link.h>
|
||||
#include <dlfcn.h>
|
||||
@@ -119,7 +120,7 @@ bool g_use_mwaitx;
|
||||
Runtime* Runtime::runtime_singleton_ = NULL;
|
||||
|
||||
hsa_status_t Runtime::Acquire() {
|
||||
ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
|
||||
std::lock_guard<std::mutex> boot(bootstrap_lock());
|
||||
|
||||
if (runtime_singleton_ == NULL) {
|
||||
memset(log_flags, 0, sizeof(log_flags));
|
||||
@@ -146,7 +147,7 @@ hsa_status_t Runtime::Acquire() {
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::Release() {
|
||||
ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
|
||||
std::lock_guard<std::mutex> boot(bootstrap_lock());
|
||||
|
||||
if (runtime_singleton_ == nullptr) return HSA_STATUS_ERROR_NOT_INITIALIZED;
|
||||
|
||||
@@ -192,7 +193,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
|
||||
agents_by_gpuid_[0] = agent;
|
||||
|
||||
// Add cpu regions to the system region list.
|
||||
for (const core::MemoryRegion* region : agent->regions()) {
|
||||
for (auto region : agent->regions()) {
|
||||
if (region->fine_grain()) {
|
||||
system_regions_fine_.push_back(region);
|
||||
} else {
|
||||
@@ -216,7 +217,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
|
||||
assert(alignment <= 4096);
|
||||
void* ptr = NULL;
|
||||
return (HSA_STATUS_SUCCESS ==
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(pool.get(), size, alloc_flags,
|
||||
&ptr, agent_node_id))
|
||||
? ptr
|
||||
: NULL;
|
||||
@@ -336,7 +337,7 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
|
||||
hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
|
||||
// Track the allocation result so that it could be freed properly.
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
allocation_map_[*address] = AllocationRegion(region, size, size_requested, alloc_flags);
|
||||
}
|
||||
|
||||
@@ -354,7 +355,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
|
||||
MemoryRegion::AllocateFlags alloc_flags = core::MemoryRegion::AllocateNoFlags;
|
||||
|
||||
{
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
|
||||
std::map<const void*, AllocationRegion>::iterator it = allocation_map_.find(ptr);
|
||||
|
||||
@@ -458,7 +459,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
|
||||
|
||||
hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback,
|
||||
void* user_data) {
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
auto mem = allocation_map_.upper_bound(ptr);
|
||||
if (mem != allocation_map_.begin()) {
|
||||
mem--;
|
||||
@@ -482,7 +483,7 @@ hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_ca
|
||||
hsa_status_t Runtime::DeregisterReleaseNotifier(void* ptr,
|
||||
hsa_amd_deallocation_callback_t callback) {
|
||||
hsa_status_t ret = HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
auto mem = allocation_map_.upper_bound(ptr);
|
||||
if (mem != allocation_map_.begin()) {
|
||||
mem--;
|
||||
@@ -552,7 +553,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
|
||||
// GPU-CPU
|
||||
// Must ensure that system memory is visible to the GPU during the copy.
|
||||
const AMD::MemoryRegion* system_region =
|
||||
static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0]);
|
||||
static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0].get());
|
||||
|
||||
void* gpuPtr = nullptr;
|
||||
const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) {
|
||||
@@ -698,7 +699,7 @@ hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
|
||||
size_t alloc_size = 0;
|
||||
|
||||
{
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
|
||||
std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);
|
||||
|
||||
@@ -929,7 +930,7 @@ hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents,
|
||||
*size = info.SizeInBytes;
|
||||
*ptr = info.MemoryAddress;
|
||||
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
allocation_map_[info.MemoryAddress] = AllocationRegion(
|
||||
nullptr, info.SizeInBytes, info.SizeInBytes, core::MemoryRegion::AllocateNoFlags);
|
||||
|
||||
@@ -1055,7 +1056,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
|
||||
|
||||
{ // memory_lock protects access to the NMappedNodes array and fragment user data since these may
|
||||
// change with calls to memory APIs.
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
|
||||
if (VMemoryPtrInfo(ptr, &retInfo, alloc, num_agents_accessible, accessible) ==
|
||||
HSA_STATUS_SUCCESS) {
|
||||
@@ -1196,7 +1197,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
|
||||
|
||||
hsa_status_t Runtime::SetPtrInfoData(const void* ptr, void* userptr) {
|
||||
{ // Use allocation map if possible to handle fragments.
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
const auto& it = allocation_map_.find(ptr);
|
||||
if (it != allocation_map_.end()) {
|
||||
it->second.user_ptr = userptr;
|
||||
@@ -1307,7 +1308,7 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) {
|
||||
size_t len = 0;
|
||||
|
||||
// Search for registered export pointer
|
||||
ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
|
||||
std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
|
||||
for (auto& conns : ipc_sock_server_conns_) {
|
||||
if (conn_handle == conns.first) {
|
||||
ptr = reinterpret_cast<void *>(conn_handle);
|
||||
@@ -1372,7 +1373,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
|
||||
if (useFrag) {
|
||||
handle->handle[6] |= 0x80000000 | fragOffset;
|
||||
// Prevent realloction of fragment for better performance.
|
||||
ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
|
||||
std::shared_lock<std::shared_mutex> lock(memory_lock_);
|
||||
err = allocation_map_[ptr].region->IPCFragmentExport(ptr);
|
||||
assert(err == HSA_STATUS_SUCCESS && "Region inconsistent with address map.");
|
||||
}
|
||||
@@ -1439,7 +1440,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
|
||||
|
||||
close(dmabuf_fd);
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
|
||||
std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
|
||||
#if defined(__linux__)
|
||||
if (!ipc_sock_server_conns_.size()) { // create new runtime socket server
|
||||
struct sockaddr_un address;
|
||||
@@ -1549,7 +1550,7 @@ int Runtime::IPCClientImport(uint32_t conn_handle, uint64_t dmabuf_fd_handle,
|
||||
|
||||
// Store the buffer object handle in allocation map for later use
|
||||
if (err == HSAKMT_STATUS_SUCCESS) {
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
allocation_map_[*importAddress] =
|
||||
AllocationRegion(nullptr, *importSize, *importSize, core::MemoryRegion::AllocateNoFlags);
|
||||
allocation_map_[*importAddress].ldrm_bo = res.buf_handle;
|
||||
@@ -1579,7 +1580,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
|
||||
importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
|
||||
len = Min(len, importSize - fragOffset);
|
||||
}
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
allocation_map_[importAddress] =
|
||||
AllocationRegion(nullptr, len, len, core::MemoryRegion::AllocateNoFlags);
|
||||
allocation_map_[importAddress].ldrm_bo = ldrm_bo;
|
||||
@@ -1711,7 +1712,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
|
||||
hsa_status_t Runtime::IPCDetach(void* ptr) {
|
||||
bool ldrmImportCleaned = false;
|
||||
{ // Handle imported fragments.
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::unique_lock<std::shared_mutex> lock(memory_lock_);
|
||||
const auto& it = allocation_map_.find(ptr);
|
||||
if (it != allocation_map_.end()) {
|
||||
if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
@@ -1728,7 +1729,7 @@ hsa_status_t Runtime::IPCDetach(void* ptr) {
|
||||
assert(!"Unimplemented!");
|
||||
#endif
|
||||
allocation_map_.erase(it);
|
||||
lock.Release(); // Can't hold memory lock when using pointer info.
|
||||
lock.unlock(); // Can't hold memory lock when using pointer info.
|
||||
|
||||
PtrInfoBlockData block = {};
|
||||
hsa_amd_pointer_info_t info = {};
|
||||
@@ -1954,7 +1955,7 @@ void Runtime::AsyncEventsPool::clear() {
|
||||
}
|
||||
|
||||
Runtime::AsyncEventItem* Runtime::AsyncEventsPool::alloc() {
|
||||
ScopedAcquire<HybridMutex> lock(&lock_);
|
||||
std::lock_guard<HybridMutex> lock(lock_);
|
||||
if (free_list_.empty()) {
|
||||
AsyncEventItem* block = reinterpret_cast<AsyncEventItem*>(
|
||||
allocate_()(block_size_ * sizeof(AsyncEventItem), __alignof(AsyncEventItem), core::MemoryRegion::AllocateNonPaged, 0));
|
||||
@@ -1985,7 +1986,7 @@ void Runtime::AsyncEventsPool::free(AsyncEventItem* ptr) {
|
||||
if (ptr == nullptr) return;
|
||||
|
||||
ptr->~AsyncEventItem();
|
||||
ScopedAcquire<HybridMutex> lock(&lock_);
|
||||
std::lock_guard<HybridMutex> lock(lock_);
|
||||
|
||||
ifdebug {
|
||||
bool valid = false;
|
||||
@@ -2059,33 +2060,33 @@ void Runtime::BindErrorHandlers() {
|
||||
|
||||
// Create memory event with manual reset to avoid racing condition
|
||||
// with driver in case of multiple concurrent VM faults.
|
||||
vm_fault_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true);
|
||||
vm_fault_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true));
|
||||
|
||||
// Create an interrupt signal object to contain the memory event.
|
||||
// This signal object will be registered with the async handler global
|
||||
// thread.
|
||||
vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_);
|
||||
vm_fault_signal_.reset(new core::InterruptSignal(0, vm_fault_event_.get()));
|
||||
|
||||
if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) {
|
||||
assert(false && "Failed on creating VM fault signal");
|
||||
return;
|
||||
}
|
||||
|
||||
SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0,
|
||||
VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_));
|
||||
SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
|
||||
VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_.get()));
|
||||
|
||||
// Create HW exception event which is for Non-RAS events
|
||||
hw_exception_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true);
|
||||
hw_exception_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true));
|
||||
|
||||
hw_exception_signal_ = new core::InterruptSignal(0, hw_exception_event_);
|
||||
hw_exception_signal_.reset(new core::InterruptSignal(0, hw_exception_event_.get()));
|
||||
|
||||
if (!hw_exception_signal_->IsValid() || hw_exception_signal_->EopEvent() == NULL) {
|
||||
assert(false && "Failed on creating HW Exception signal");
|
||||
return;
|
||||
}
|
||||
|
||||
SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_), HSA_SIGNAL_CONDITION_NE, 0,
|
||||
HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_));
|
||||
SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
|
||||
HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_.get()));
|
||||
}
|
||||
|
||||
bool Runtime::HwExceptionHandler(hsa_signal_value_t val, void* arg) {
|
||||
@@ -2262,7 +2263,8 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
}
|
||||
|
||||
void Runtime::PrintMemoryMapNear(void* ptr) {
|
||||
runtime_singleton_->memory_lock_.Acquire();
|
||||
std::unique_lock<std::shared_mutex> lock(runtime_singleton_->memory_lock_);
|
||||
|
||||
auto it = runtime_singleton_->allocation_map_.upper_bound(ptr);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (it != runtime_singleton_->allocation_map_.begin()) it--;
|
||||
@@ -2287,8 +2289,9 @@ void Runtime::PrintMemoryMapNear(void* ptr) {
|
||||
it++;
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
it = start;
|
||||
runtime_singleton_->memory_lock_.Release();
|
||||
it = start;
|
||||
lock.unlock();
|
||||
|
||||
hsa_amd_pointer_info_t info = {};
|
||||
PtrInfoBlockData block = {};
|
||||
uint32_t count = 0;
|
||||
@@ -2408,7 +2411,7 @@ hsa_status_t Runtime::Load() {
|
||||
|
||||
BindErrorHandlers();
|
||||
|
||||
loader_ = amd::hsa::loader::Loader::Create(&loader_context_);
|
||||
loader_.reset(amd::hsa::loader::Loader::Create(&loader_context_));
|
||||
|
||||
// Load extensions
|
||||
LoadExtensions();
|
||||
@@ -2449,8 +2452,8 @@ void Runtime::Unload() {
|
||||
UnloadTools();
|
||||
UnloadExtensions();
|
||||
|
||||
amd::hsa::loader::Loader::Destroy(loader_);
|
||||
loader_ = nullptr;
|
||||
amd::hsa::loader::Loader::Destroy(loader_.get());
|
||||
loader_.reset();
|
||||
|
||||
for(auto nodeAgent: agents_by_node_) {
|
||||
for (auto agent: nodeAgent.second)
|
||||
@@ -2462,17 +2465,17 @@ void Runtime::Unload() {
|
||||
|
||||
if (vm_fault_signal_ != nullptr) {
|
||||
vm_fault_signal_->DestroySignal();
|
||||
vm_fault_signal_ = nullptr;
|
||||
vm_fault_signal_.reset();
|
||||
}
|
||||
core::InterruptSignal::DestroyEvent(vm_fault_event_);
|
||||
vm_fault_event_ = nullptr;
|
||||
|
||||
vm_fault_event_.reset();
|
||||
|
||||
if (hw_exception_signal_ != nullptr) {
|
||||
hw_exception_signal_->DestroySignal();
|
||||
hw_exception_signal_ = nullptr;
|
||||
hw_exception_signal_.reset();
|
||||
}
|
||||
core::InterruptSignal::DestroyEvent(hw_exception_event_);
|
||||
hw_exception_event_ = nullptr;
|
||||
|
||||
hw_exception_event_.reset();
|
||||
|
||||
SharedSignalPool.clear();
|
||||
|
||||
@@ -2890,7 +2893,7 @@ void Runtime::AsyncEvents::Clear() {
|
||||
|
||||
hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback,
|
||||
void* data) {
|
||||
ScopedAcquire<KernelMutex> lock(&system_event_lock_);
|
||||
std::lock_guard<std::mutex> lock(system_event_lock_);
|
||||
system_event_handlers_.push_back(
|
||||
std::make_pair(AMD::callback_t<hsa_amd_system_event_callback_t>(callback), data));
|
||||
return HSA_STATUS_SUCCESS;
|
||||
@@ -2898,7 +2901,7 @@ hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_
|
||||
|
||||
std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
|
||||
Runtime::GetSystemEventHandlers() {
|
||||
ScopedAcquire<KernelMutex> lock(&system_event_lock_);
|
||||
std::lock_guard<std::mutex> lock(system_event_lock_);
|
||||
return system_event_handlers_;
|
||||
}
|
||||
|
||||
@@ -3269,7 +3272,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
|
||||
}
|
||||
|
||||
{
|
||||
ScopedAcquire<KernelMutex> lock(&prefetch_lock_);
|
||||
std::lock_guard<std::mutex> lock(prefetch_lock_);
|
||||
// Remove all fully overlapped and trim partially overlapped ranges.
|
||||
// Get iteration bounds
|
||||
auto start = prefetch_map_.upper_bound(base);
|
||||
@@ -3332,7 +3335,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
|
||||
|
||||
// Remove the prefetch's ranges from the map.
|
||||
static auto removePrefetchRanges = [](PrefetchOp* op) {
|
||||
ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
|
||||
std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
|
||||
auto it = op->prefetch_map_entry;
|
||||
while (it != Runtime::runtime_singleton_->prefetch_map_.end()) {
|
||||
auto next = it->second.next;
|
||||
@@ -3389,7 +3392,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
|
||||
|
||||
std::vector<std::pair<uintptr_t, uintptr_t>> holes;
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
|
||||
std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
|
||||
auto start = prefetch_map_.upper_bound(base);
|
||||
if (start != prefetch_map_.begin()) start--;
|
||||
auto stop = prefetch_map_.lower_bound(end);
|
||||
@@ -3441,7 +3444,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
|
||||
hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, uint64_t* offset,
|
||||
uint64_t flags) {
|
||||
#ifdef __linux__
|
||||
ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
|
||||
std::shared_lock<std::shared_mutex> lock(memory_lock_);
|
||||
// Lookup containing allocation.
|
||||
auto mem = allocation_map_.upper_bound(ptr);
|
||||
if (mem != allocation_map_.begin()) {
|
||||
@@ -3507,7 +3510,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add
|
||||
|
||||
if (!alignment) alignment = rocr::os::PageSize();
|
||||
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
|
||||
if (flags & HSA_AMD_VMEM_ADDRESS_NO_REGISTER) {
|
||||
size_t requested = size + alignment - rocr::os::PageSize();
|
||||
@@ -3548,7 +3551,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::VMemoryAddressFree(void* va, size_t size) {
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
std::map<const void*, AddressHandle>::iterator it = reserved_address_map_.find(va);
|
||||
|
||||
if (it == reserved_address_map_.end()) {
|
||||
@@ -3580,7 +3583,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
|
||||
if (!IsMultipleOf(size, memRegion->GetPageSize()))
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
ThunkHandle user_mode_driver_handle;
|
||||
hsa_status_t status =
|
||||
region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0);
|
||||
@@ -3597,7 +3600,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::VMemoryHandleRelease(hsa_amd_vmem_alloc_handle_t memoryOnlyHandle) {
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
auto memoryHandleIt = memory_handle_map_.find(MemoryHandle::Convert(memoryOnlyHandle));
|
||||
|
||||
if (memoryHandleIt == memory_handle_map_.end()) {
|
||||
@@ -3628,7 +3631,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
|
||||
uint64_t offset = 0, ret;
|
||||
uint64_t drm_cpu_addr = 0;
|
||||
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
auto addressHandle = VMemoryFindReservedAddressHandle(va);
|
||||
if (addressHandle == nullptr ||
|
||||
reinterpret_cast<uint8_t*>(va) + size >
|
||||
@@ -3703,7 +3706,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::VMemoryHandleUnmap(void* va, size_t size) {
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
std::list<std::pair<void*, MappedHandle*>> mappedHandles;
|
||||
|
||||
// va + size may consist of multiple MappedHandle's.
|
||||
@@ -3921,7 +3924,7 @@ hsa_status_t Runtime::VMemorySetAccess(void* va, size_t size,
|
||||
if (targetAgent == NULL || !targetAgent->IsValid()) return HSA_STATUS_ERROR_INVALID_AGENT;
|
||||
}
|
||||
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
|
||||
auto addressHandle = VMemoryFindReservedAddressHandle(va);
|
||||
if (addressHandle == nullptr ||
|
||||
@@ -4014,7 +4017,7 @@ hsa_status_t Runtime::VMemoryGetAccess(const void* va, hsa_access_permission_t*
|
||||
*perms = HSA_ACCESS_PERMISSION_NONE;
|
||||
bool mappedHandleFound = false;
|
||||
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
std::lock_guard<std::shared_mutex> lock(memory_lock_);
|
||||
|
||||
auto mappedHandleIt = mapped_handle_map_.upper_bound(va);
|
||||
if (mappedHandleIt != mapped_handle_map_.begin()) {
|
||||
@@ -4076,8 +4079,8 @@ hsa_status_t Runtime::VMemoryImportShareableHandle(int dmabuf_fd,
|
||||
return;
|
||||
}
|
||||
|
||||
for (const core::MemoryRegion* region : agent->regions()) {
|
||||
const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region);
|
||||
for (const auto& region : agent->regions()) {
|
||||
const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region.get());
|
||||
|
||||
// TODO: Verify that this works on a system with FINE_GRAINED memory.
|
||||
// System's with FINE_GRAINED will have both COARSE and FINE grain... need to get the
|
||||
|
||||
@@ -58,7 +58,7 @@
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
|
||||
KernelMutex Signal::ipcLock_;
|
||||
std::mutex Signal::ipcLock_;
|
||||
std::map<decltype(hsa_signal_t::handle), Signal*> Signal::ipcMap_;
|
||||
|
||||
void SharedSignalPool_t::clear() {
|
||||
@@ -76,7 +76,7 @@ void SharedSignalPool_t::clear() {
|
||||
}
|
||||
|
||||
SharedSignal* SharedSignalPool_t::alloc() {
|
||||
ScopedAcquire<HybridMutex> lock(&lock_);
|
||||
std::lock_guard<HybridMutex> lock(lock_);
|
||||
if (free_list_.empty()) {
|
||||
SharedSignal* block = reinterpret_cast<SharedSignal*>(
|
||||
allocate_()(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), core::MemoryRegion::AllocateNonPaged, 0));
|
||||
@@ -109,7 +109,7 @@ void SharedSignalPool_t::free(SharedSignal* ptr) {
|
||||
if (ptr == nullptr) return;
|
||||
|
||||
ptr->~SharedSignal();
|
||||
ScopedAcquire<HybridMutex> lock(&lock_);
|
||||
std::lock_guard<HybridMutex> lock(lock_);
|
||||
|
||||
ifdebug {
|
||||
bool valid = false;
|
||||
@@ -134,7 +134,7 @@ LocalSignal::LocalSignal(hsa_signal_value_t initial_value, bool exportable)
|
||||
}
|
||||
|
||||
void Signal::registerIpc() {
|
||||
ScopedAcquire<KernelMutex> lock(&ipcLock_);
|
||||
std::lock_guard<std::mutex> lock(ipcLock_);
|
||||
auto handle = Convert(this);
|
||||
assert(ipcMap_.find(handle.handle) == ipcMap_.end() &&
|
||||
"Can't register the same IPC signal twice.");
|
||||
@@ -142,7 +142,7 @@ void Signal::registerIpc() {
|
||||
}
|
||||
|
||||
bool Signal::deregisterIpc() {
|
||||
ScopedAcquire<KernelMutex> lock(&ipcLock_);
|
||||
std::lock_guard<std::mutex> lock(ipcLock_);
|
||||
if (refcount_ != 0) return false;
|
||||
auto handle = Convert(this);
|
||||
const auto& it = ipcMap_.find(handle.handle);
|
||||
@@ -152,14 +152,14 @@ bool Signal::deregisterIpc() {
|
||||
}
|
||||
|
||||
Signal* Signal::lookupIpc(hsa_signal_t signal) {
|
||||
ScopedAcquire<KernelMutex> lock(&ipcLock_);
|
||||
std::lock_guard<std::mutex> lock(ipcLock_);
|
||||
const auto& it = ipcMap_.find(signal.handle);
|
||||
if (it == ipcMap_.end()) return nullptr;
|
||||
return it->second;
|
||||
}
|
||||
|
||||
Signal* Signal::duplicateIpc(hsa_signal_t signal) {
|
||||
ScopedAcquire<KernelMutex> lock(&ipcLock_);
|
||||
std::lock_guard<std::mutex> lock(ipcLock_);
|
||||
const auto& it = ipcMap_.find(signal.handle);
|
||||
if (it == ipcMap_.end()) return nullptr;
|
||||
it->second->refcount_++;
|
||||
|
||||
@@ -125,16 +125,16 @@ template <typename T> class lazy_ptr {
|
||||
private:
|
||||
mutable std::unique_ptr<T> obj;
|
||||
mutable std::function<T*(void)> func;
|
||||
mutable KernelMutex lock;
|
||||
mutable std::mutex lock;
|
||||
|
||||
// Separated from make to improve inlining.
|
||||
void make_body(bool block) const {
|
||||
if (block) {
|
||||
lock.Acquire();
|
||||
} else if (!lock.Try()) {
|
||||
lock.lock();
|
||||
} else if (!lock.try_lock()) {
|
||||
return;
|
||||
}
|
||||
MAKE_SCOPE_GUARD([&]() { lock.Release(); });
|
||||
MAKE_SCOPE_GUARD([&]() { lock.unlock(); });
|
||||
if (func == nullptr) return;
|
||||
T* ptr = func();
|
||||
obj.reset(ptr);
|
||||
|
||||
@@ -90,6 +90,11 @@ class HybridMutex {
|
||||
os::PostSemaphore(sem_);
|
||||
}
|
||||
|
||||
// To add compatibility with std::lock_guard
|
||||
void lock() { Acquire(); }
|
||||
void unlock() { Release(); }
|
||||
bool try_lock() { return Try(); }
|
||||
|
||||
private:
|
||||
std::atomic<int> lock_;
|
||||
os::Semaphore sem_;
|
||||
@@ -100,27 +105,6 @@ class HybridMutex {
|
||||
DISALLOW_COPY_AND_ASSIGN(HybridMutex);
|
||||
};
|
||||
|
||||
|
||||
/// @brief: a class represents a kernel mutex.
|
||||
/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
|
||||
/// until the lock is released (Best for long waits, though anything using
|
||||
/// a kernel object is a long wait).
|
||||
class KernelMutex {
|
||||
public:
|
||||
KernelMutex() { lock_ = os::CreateMutex(); }
|
||||
~KernelMutex() { os::DestroyMutex(lock_); }
|
||||
|
||||
bool Try() { return os::TryAcquireMutex(lock_); }
|
||||
bool Acquire() { return os::AcquireMutex(lock_); }
|
||||
void Release() { os::ReleaseMutex(lock_); }
|
||||
|
||||
private:
|
||||
os::Mutex lock_;
|
||||
|
||||
/// @brief: Disable copiable and assignable ability.
|
||||
DISALLOW_COPY_AND_ASSIGN(KernelMutex);
|
||||
};
|
||||
|
||||
/// @brief: represents a spin lock.
|
||||
/// For very short hold durations on the order of the thread scheduling
|
||||
/// quanta or less.
|
||||
@@ -143,6 +127,11 @@ class SpinMutex {
|
||||
}
|
||||
void Release() { lock_ = 0; }
|
||||
|
||||
// To add compatibility with std::lock_guard
|
||||
void lock() { Acquire(); }
|
||||
void unlock() { Release(); }
|
||||
bool try_lock() { return Try(); }
|
||||
|
||||
private:
|
||||
std::atomic<int> lock_;
|
||||
|
||||
@@ -167,124 +156,6 @@ class KernelEvent {
|
||||
DISALLOW_COPY_AND_ASSIGN(KernelEvent);
|
||||
};
|
||||
|
||||
/// @brief: represents a yielding shared mutex.
|
||||
/// aka read/write mutex
|
||||
class KernelSharedMutex {
|
||||
public:
|
||||
/// @brief: Interfaces ScopedAcquire to shared operations.
|
||||
class Shared {
|
||||
public:
|
||||
explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
|
||||
bool Try() { return lock_->TryShared(); }
|
||||
bool Acquire() { return lock_->AcquireShared(); }
|
||||
void Release() { lock_->ReleaseShared(); }
|
||||
|
||||
private:
|
||||
KernelSharedMutex* lock_;
|
||||
};
|
||||
|
||||
KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
|
||||
~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
|
||||
|
||||
// Exclusive mode operations
|
||||
bool Try() { return os::TryAcquireSharedMutex(lock_); }
|
||||
bool Acquire() { return os::AcquireSharedMutex(lock_); }
|
||||
void Release() { os::ReleaseSharedMutex(lock_); }
|
||||
|
||||
// Shared mode operations
|
||||
bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
|
||||
bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
|
||||
void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
|
||||
|
||||
// Return shared operations interface
|
||||
Shared shared() { return Shared(this); }
|
||||
|
||||
private:
|
||||
os::SharedMutex lock_;
|
||||
|
||||
/// @brief: Disable copiable and assignable ability.
|
||||
DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
|
||||
};
|
||||
|
||||
/// @brief: Type trait to identify mutex types
|
||||
template <class T> class isMutex {
|
||||
public:
|
||||
enum { value = false };
|
||||
};
|
||||
template <> class isMutex<HybridMutex> {
|
||||
public:
|
||||
enum { value = true };
|
||||
};
|
||||
template <> class isMutex<KernelMutex> {
|
||||
public:
|
||||
enum { value = true };
|
||||
};
|
||||
template <> class isMutex<SpinMutex> {
|
||||
public:
|
||||
enum { value = true };
|
||||
};
|
||||
template <> class isMutex<KernelSharedMutex> {
|
||||
public:
|
||||
enum { value = true };
|
||||
};
|
||||
|
||||
/// @brief: A class behaves as a lock in a scope. When trying to enter into the
|
||||
/// critical section, creat a object of this class. After the control path goes
|
||||
/// out of the scope, it will release the lock automatically.
|
||||
template <class LockType> class ScopedAcquire {
|
||||
public:
|
||||
/// @brief: When constructing, acquire the lock.
|
||||
/// @param: lock(Input), pointer to an existing lock.
|
||||
explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
|
||||
static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
|
||||
lock_.Acquire();
|
||||
}
|
||||
explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
|
||||
static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
|
||||
lock_.Acquire();
|
||||
}
|
||||
|
||||
/// @brief: when destructing, release the lock.
|
||||
~ScopedAcquire() {
|
||||
if (doRelease) lock_.Release();
|
||||
}
|
||||
|
||||
/// @brief: Release the lock early. Avoid using when possible.
|
||||
void Release() {
|
||||
lock_.Release();
|
||||
doRelease = false;
|
||||
}
|
||||
|
||||
private:
|
||||
/// @brief: Adapts between pointers to mutex types and mutex pointer types.
|
||||
template <class T, bool B> class container {
|
||||
public:
|
||||
container(T* lock) : lock_(lock) {}
|
||||
__forceinline bool Acquire() { return lock_->Acquire(); }
|
||||
__forceinline void Release() { return lock_->Release(); }
|
||||
|
||||
private:
|
||||
T* lock_;
|
||||
};
|
||||
|
||||
/// @brief: Specialization for mutex pointer types.
|
||||
template <class T> class container<T, false> {
|
||||
public:
|
||||
container(T lock) : lock_(lock) {}
|
||||
__forceinline bool Acquire() { return lock_.Acquire(); }
|
||||
__forceinline void Release() { return lock_.Release(); }
|
||||
|
||||
private:
|
||||
T lock_;
|
||||
};
|
||||
|
||||
container<LockType, isMutex<LockType>::value> lock_;
|
||||
bool doRelease;
|
||||
|
||||
/// @brief: Disable copiable and assignable ability.
|
||||
DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
|
||||
};
|
||||
|
||||
} // namespace rocr
|
||||
|
||||
#endif // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_
|
||||
|
||||
@@ -286,11 +286,6 @@ namespace code {
|
||||
}
|
||||
}
|
||||
|
||||
AmdHsaCode::~AmdHsaCode()
|
||||
{
|
||||
for (Symbol* sym : symbols) { delete sym; }
|
||||
}
|
||||
|
||||
bool AmdHsaCode::PullElf()
|
||||
{
|
||||
uint32_t majorVersion, minorVersion;
|
||||
@@ -330,7 +325,7 @@ namespace code {
|
||||
}
|
||||
for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) {
|
||||
amd::elf::Symbol* elfsym = img->symtab()->symbol(i);
|
||||
Symbol* sym = 0;
|
||||
std::shared_ptr<Symbol> sym;
|
||||
switch (elfsym->type()) {
|
||||
case STT_AMDGPU_HSA_KERNEL: {
|
||||
amd::elf::Section* sec = elfsym->section();
|
||||
@@ -347,12 +342,12 @@ namespace code {
|
||||
out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
|
||||
return false;
|
||||
}
|
||||
sym = new KernelSymbol(elfsym, &akc);
|
||||
sym = std::make_shared<KernelSymbol>(elfsym, &akc);
|
||||
break;
|
||||
}
|
||||
case STT_OBJECT:
|
||||
case STT_COMMON:
|
||||
sym = new VariableSymbol(elfsym);
|
||||
sym = std::make_shared<VariableSymbol>(elfsym);
|
||||
break;
|
||||
default:
|
||||
break; // Skip unknown symbols.
|
||||
@@ -924,9 +919,9 @@ namespace code {
|
||||
std::string(module_name ? module_name : ""),
|
||||
std::string(symbol_name)
|
||||
);
|
||||
for (Symbol* sym : symbols) {
|
||||
for (const auto& sym : symbols) {
|
||||
if (sym->Name() == mname) {
|
||||
*s = Symbol::ToHandle(sym);
|
||||
*s = Symbol::ToHandle(sym.get());
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
@@ -940,8 +935,8 @@ namespace code {
|
||||
void* data),
|
||||
void* data)
|
||||
{
|
||||
for (Symbol* sym : symbols) {
|
||||
hsa_code_symbol_t s = Symbol::ToHandle(sym);
|
||||
for (const auto& sym : symbols) {
|
||||
hsa_code_symbol_t s = Symbol::ToHandle(sym.get());
|
||||
hsa_status_t status = callback(code_object, s, data);
|
||||
if (status != HSA_STATUS_SUCCESS) { return status; }
|
||||
}
|
||||
@@ -1144,8 +1139,8 @@ namespace code {
|
||||
{
|
||||
if (nullptr == img) { return nullptr; }
|
||||
if (!section) { section = HsaText(); }
|
||||
symbols.push_back(new KernelSymbol(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
|
||||
return symbols.back();
|
||||
symbols.push_back(std::make_shared<KernelSymbol>(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
|
||||
return symbols.back().get();
|
||||
}
|
||||
|
||||
Symbol* AmdHsaCode::AddVariableSymbol(const std::string &name,
|
||||
@@ -1157,8 +1152,8 @@ namespace code {
|
||||
uint64_t size)
|
||||
{
|
||||
if (nullptr == img) { return nullptr; }
|
||||
symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
|
||||
return symbols.back();
|
||||
symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
|
||||
return symbols.back().get();
|
||||
}
|
||||
|
||||
void AmdHsaCode::AddSectionSymbols()
|
||||
@@ -1166,16 +1161,16 @@ namespace code {
|
||||
if (nullptr == img) { return; }
|
||||
for (size_t i = 0; i < dataSections.size(); ++i) {
|
||||
if (dataSections[i] && dataSections[i]->flags() & SHF_ALLOC) {
|
||||
symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
|
||||
symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Symbol* AmdHsaCode::GetSymbolByElfIndex(size_t index)
|
||||
{
|
||||
for (auto &s : symbols) {
|
||||
for (const auto &s : symbols) {
|
||||
if (s && index == s->Index()) {
|
||||
return s;
|
||||
return s.get();
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
@@ -1185,7 +1180,7 @@ namespace code {
|
||||
{
|
||||
for (auto &s : symbols) {
|
||||
if (s && n == s->Name()) {
|
||||
return s;
|
||||
return s.get();
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
@@ -1747,14 +1742,13 @@ namespace code {
|
||||
return false;
|
||||
}
|
||||
|
||||
AmdHsaCode* AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
|
||||
const std::shared_ptr<AmdHsaCode>& AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
|
||||
{
|
||||
CodeMap::iterator i = codeMap.find(c.handle);
|
||||
if (i == codeMap.end()) {
|
||||
AmdHsaCode* code = new AmdHsaCode();
|
||||
std::shared_ptr<AmdHsaCode> code = std::make_shared<AmdHsaCode>();
|
||||
const void* buffer = reinterpret_cast<const void*>(c.handle);
|
||||
if (!code->InitAsBuffer(buffer, 0)) {
|
||||
delete code;
|
||||
return 0;
|
||||
}
|
||||
codeMap[c.handle] = code;
|
||||
@@ -1770,7 +1764,7 @@ namespace code {
|
||||
// Currently, we do not always create map entry for every code object buffer.
|
||||
return true;
|
||||
}
|
||||
delete i->second;
|
||||
i->second.reset();
|
||||
codeMap.erase(i);
|
||||
return true;
|
||||
}
|
||||
@@ -1798,7 +1792,7 @@ namespace code {
|
||||
}
|
||||
for (size_t i = 0; i < img->getSymbolTable()->symbolCount(); ++i) {
|
||||
amd::elf::Symbol* elfsym = img->getSymbolTable()->symbol(i);
|
||||
Symbol* sym = 0;
|
||||
std::shared_ptr<Symbol> sym;
|
||||
switch (elfsym->type()) {
|
||||
case STT_AMDGPU_HSA_KERNEL: {
|
||||
amd::elf::Section* sec = elfsym->section();
|
||||
@@ -1815,12 +1809,12 @@ namespace code {
|
||||
out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
|
||||
return false;
|
||||
}
|
||||
sym = new KernelSymbolV2(elfsym, &akc);
|
||||
sym = std::make_shared<KernelSymbolV2>(elfsym, &akc);
|
||||
break;
|
||||
}
|
||||
case STT_OBJECT:
|
||||
case STT_COMMON:
|
||||
sym = new VariableSymbolV2(elfsym);
|
||||
sym = std::make_shared<VariableSymbolV2>(elfsym);
|
||||
break;
|
||||
default:
|
||||
break; // Skip unknown symbols.
|
||||
|
||||
@@ -186,7 +186,6 @@ void Loader::Destroy(Loader *loader)
|
||||
_amdgpu_r_debug.r_map = nullptr;
|
||||
_amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT;
|
||||
r_debug_tail() = nullptr;
|
||||
delete loader;
|
||||
}
|
||||
|
||||
Executable* AmdHsaCodeLoader::CreateExecutable(
|
||||
@@ -194,8 +193,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
|
||||
{
|
||||
WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
|
||||
|
||||
executables.push_back(new ExecutableImpl(profile, context, executables.size(), default_float_rounding_mode));
|
||||
return executables.back();
|
||||
executables.push_back(std::make_shared<ExecutableImpl>(profile, context, executables.size(), default_float_rounding_mode));
|
||||
return executables.back().get();
|
||||
}
|
||||
|
||||
Executable* AmdHsaCodeLoader::CreateExecutable(
|
||||
@@ -206,8 +205,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
|
||||
{
|
||||
WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
|
||||
|
||||
executables.push_back(new ExecutableImpl(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
|
||||
return executables.back();
|
||||
executables.push_back(std::make_shared<ExecutableImpl>(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
|
||||
return executables.back().get();
|
||||
}
|
||||
|
||||
static void AddCodeObjectInfoIntoDebugMap(link_map* map) {
|
||||
@@ -254,7 +253,7 @@ hsa_status_t AmdHsaCodeLoader::FreezeExecutable(Executable *executable, const ch
|
||||
atomic::Fence(std::memory_order_acq_rel);
|
||||
_loader_debug_state();
|
||||
atomic::Fence(std::memory_order_acq_rel);
|
||||
for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
|
||||
for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
|
||||
AddCodeObjectInfoIntoDebugMap(&(lco->r_debug_info));
|
||||
}
|
||||
atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
|
||||
@@ -270,14 +269,13 @@ void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) {
|
||||
atomic::Fence(std::memory_order_acq_rel);
|
||||
_loader_debug_state();
|
||||
atomic::Fence(std::memory_order_acq_rel);
|
||||
for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
|
||||
for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
|
||||
RemoveCodeObjectInfoFromDebugMap(&(lco->r_debug_info));
|
||||
}
|
||||
atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
|
||||
_loader_debug_state();
|
||||
|
||||
executables[((ExecutableImpl*)executable)->id()] = nullptr;
|
||||
delete executable;
|
||||
executables[static_cast<ExecutableImpl*>(executable)->id()].reset();
|
||||
}
|
||||
|
||||
hsa_status_t AmdHsaCodeLoader::IterateExecutables(
|
||||
@@ -289,9 +287,9 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables(
|
||||
WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
|
||||
assert(callback);
|
||||
|
||||
for (auto &exec : executables) {
|
||||
for (const auto &exec : executables) {
|
||||
if(exec != nullptr){
|
||||
hsa_status_t status = callback(Executable::Handle(exec), data);
|
||||
hsa_status_t status = callback(Executable::Handle(exec.get()), data);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
@@ -318,7 +316,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
|
||||
this->EnableReadOnlyMode();
|
||||
|
||||
size_t actual_num_segment_descriptors = 0;
|
||||
for (auto &executable : executables) {
|
||||
for (const auto &executable : executables) {
|
||||
if (executable) {
|
||||
actual_num_segment_descriptors += executable->GetNumSegmentDescriptors();
|
||||
}
|
||||
@@ -335,7 +333,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
for (auto &executable : executables) {
|
||||
for (const auto &executable : executables) {
|
||||
if (executable) {
|
||||
i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i);
|
||||
}
|
||||
@@ -352,7 +350,7 @@ uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address)
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (auto &exec : executables) {
|
||||
for (const auto &exec : executables) {
|
||||
if (exec != nullptr) {
|
||||
uint64_t host_address = exec->FindHostAddress(device_address);
|
||||
if (host_address != 0) {
|
||||
@@ -371,9 +369,9 @@ void AmdHsaCodeLoader::PrintHelp(std::ostream& out)
|
||||
void AmdHsaCodeLoader::EnableReadOnlyMode()
|
||||
{
|
||||
rw_lock_.ReaderLock();
|
||||
for (auto &executable : executables) {
|
||||
for (const auto &executable : executables) {
|
||||
if (executable) {
|
||||
((ExecutableImpl*)executable)->EnableReadOnlyMode();
|
||||
((ExecutableImpl*)executable.get())->EnableReadOnlyMode();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -381,9 +379,9 @@ void AmdHsaCodeLoader::EnableReadOnlyMode()
|
||||
void AmdHsaCodeLoader::DisableReadOnlyMode()
|
||||
{
|
||||
rw_lock_.ReaderUnlock();
|
||||
for (auto &executable : executables) {
|
||||
for (const auto &executable : executables) {
|
||||
if (executable) {
|
||||
((ExecutableImpl*)executable)->DisableReadOnlyMode();
|
||||
((ExecutableImpl*)executable.get())->DisableReadOnlyMode();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -781,18 +779,10 @@ ExecutableImpl::ExecutableImpl(
|
||||
}
|
||||
|
||||
ExecutableImpl::~ExecutableImpl() {
|
||||
for (ExecutableObject* o : objects) {
|
||||
for (const auto& o : objects) {
|
||||
o->Destroy();
|
||||
delete o;
|
||||
}
|
||||
objects.clear();
|
||||
|
||||
for (auto &symbol_entry : program_symbols_) {
|
||||
delete symbol_entry.second;
|
||||
}
|
||||
for (auto &symbol_entry : agent_symbols_) {
|
||||
delete symbol_entry.second;
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
|
||||
@@ -812,7 +802,7 @@ hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
|
||||
|
||||
program_symbols_.insert(
|
||||
std::make_pair(std::string(name),
|
||||
new VariableSymbol(true,
|
||||
std::make_shared<VariableSymbol>(true,
|
||||
"", // Only program linkage symbols can be
|
||||
// defined.
|
||||
std::string(name),
|
||||
@@ -848,7 +838,7 @@ hsa_status_t ExecutableImpl::DefineAgentExternalVariable(
|
||||
|
||||
auto insert_status = agent_symbols_.insert(
|
||||
std::make_pair(std::make_pair(std::string(name), agent),
|
||||
new VariableSymbol(true,
|
||||
std::make_shared<VariableSymbol>(true,
|
||||
"", // Only program linkage symbols can be
|
||||
// defined.
|
||||
std::string(name),
|
||||
@@ -896,14 +886,14 @@ Symbol* ExecutableImpl::GetSymbolInternal(
|
||||
if (!agent) {
|
||||
auto program_symbol = program_symbols_.find(mangled_name);
|
||||
if (program_symbol != program_symbols_.end()) {
|
||||
return program_symbol->second;
|
||||
return program_symbol->second.get();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto agent_symbol = agent_symbols_.find(std::make_pair(mangled_name, *agent));
|
||||
if (agent_symbol != agent_symbols_.end()) {
|
||||
return agent_symbol->second;
|
||||
return agent_symbol->second.get();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -916,14 +906,14 @@ hsa_status_t ExecutableImpl::IterateSymbols(
|
||||
|
||||
for (auto &symbol_entry : program_symbols_) {
|
||||
hsa_status_t hsc =
|
||||
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
|
||||
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
|
||||
if (HSA_STATUS_SUCCESS != hsc) {
|
||||
return hsc;
|
||||
}
|
||||
}
|
||||
for (auto &symbol_entry : agent_symbols_) {
|
||||
hsa_status_t hsc =
|
||||
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
|
||||
callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
|
||||
if (HSA_STATUS_SUCCESS != hsc) {
|
||||
return hsc;
|
||||
}
|
||||
@@ -948,7 +938,7 @@ hsa_status_t ExecutableImpl::IterateAgentSymbols(
|
||||
}
|
||||
|
||||
hsa_status_t status = callback(
|
||||
Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second),
|
||||
Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second.get()),
|
||||
data);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
@@ -968,7 +958,7 @@ hsa_status_t ExecutableImpl::IterateProgramSymbols(
|
||||
|
||||
for (auto &symbol_entry : program_symbols_) {
|
||||
hsa_status_t status = callback(
|
||||
Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
|
||||
Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
@@ -987,10 +977,10 @@ hsa_status_t ExecutableImpl::IterateLoadedCodeObjects(
|
||||
ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
|
||||
assert(callback);
|
||||
|
||||
for (auto &loaded_code_object : loaded_code_objects) {
|
||||
for (const auto& loaded_code_object : loaded_code_objects) {
|
||||
hsa_status_t status = callback(
|
||||
Executable::Handle(this),
|
||||
LoadedCodeObject::Handle(loaded_code_object),
|
||||
LoadedCodeObject::Handle(loaded_code_object.get()),
|
||||
data);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
@@ -1004,7 +994,7 @@ size_t ExecutableImpl::GetNumSegmentDescriptors()
|
||||
{
|
||||
// assuming we are in readonly mode.
|
||||
size_t actual_num_segment_descriptors = 0;
|
||||
for (auto &obj : loaded_code_objects) {
|
||||
for (const auto &obj : loaded_code_objects) {
|
||||
actual_num_segment_descriptors += obj->LoadedSegments().size();
|
||||
}
|
||||
return actual_num_segment_descriptors;
|
||||
@@ -1020,7 +1010,7 @@ size_t ExecutableImpl::QuerySegmentDescriptors(
|
||||
assert(first_empty_segment_descriptor < total_num_segment_descriptors);
|
||||
|
||||
size_t i = first_empty_segment_descriptor;
|
||||
for (auto &obj : loaded_code_objects) {
|
||||
for (const auto &obj : loaded_code_objects) {
|
||||
assert(i < total_num_segment_descriptors);
|
||||
for (auto &seg : obj->LoadedSegments()) {
|
||||
segment_descriptors[i].agent = seg->Agent();
|
||||
@@ -1084,11 +1074,11 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
|
||||
return execHandle;
|
||||
}
|
||||
|
||||
for (auto &exec : executables) {
|
||||
for (const auto &exec : executables) {
|
||||
if (exec != nullptr) {
|
||||
uint64_t host_address = exec->FindHostAddress(device_address);
|
||||
if (host_address != 0) {
|
||||
return Executable::Handle(exec);
|
||||
return Executable::Handle(exec.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1098,7 +1088,7 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
|
||||
uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address)
|
||||
{
|
||||
ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
|
||||
for (auto &obj : loaded_code_objects) {
|
||||
for (const auto &obj : loaded_code_objects) {
|
||||
assert(obj);
|
||||
for (auto &seg : obj->LoadedSegments()) {
|
||||
assert(seg);
|
||||
@@ -1224,7 +1214,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
|
||||
|
||||
uint32_t codeNum = NextCodeObjectNum();
|
||||
|
||||
code.reset(new code::AmdHsaCode());
|
||||
code = std::make_unique<code::AmdHsaCode>();
|
||||
|
||||
std::string substituteFileName;
|
||||
for (const Substitute& ss : substitutes) {
|
||||
@@ -1306,8 +1296,8 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
|
||||
|
||||
hsa_status_t status;
|
||||
|
||||
objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize()));
|
||||
loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back());
|
||||
objects.push_back(std::make_shared<LoadedCodeObjectImpl>(this, agent, code->ElfData(), code->ElfSize()));
|
||||
loaded_code_objects.push_back(std::static_pointer_cast<LoadedCodeObjectImpl>(objects.back()));
|
||||
|
||||
status = LoadSegments(agent, code.get(), majorVersion);
|
||||
if (status != HSA_STATUS_SUCCESS) return status;
|
||||
@@ -1338,7 +1328,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
|
||||
loaded_code_objects.back()->r_debug_info.l_prev = nullptr;
|
||||
loaded_code_objects.back()->r_debug_info.l_next = nullptr;
|
||||
|
||||
if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back()); }
|
||||
if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back().get()); }
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -1376,18 +1366,18 @@ hsa_status_t ExecutableImpl::LoadSegmentsV2(hsa_agent_t agent,
|
||||
AMD_ISA_ALIGN_BYTES, true);
|
||||
if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
Segment *load_segment = new Segment(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
|
||||
std::shared_ptr<Segment> load_segment = std::make_shared<Segment>(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
|
||||
ptr, size, vaddr, c->DataSegment(0)->offset());
|
||||
if (!load_segment) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
for (size_t i = 0; i < c->DataSegmentCount(); ++i) {
|
||||
status = LoadSegmentV2(c->DataSegment(i), load_segment);
|
||||
status = LoadSegmentV2(c->DataSegment(i), load_segment.get());
|
||||
if (status != HSA_STATUS_SUCCESS) return status;
|
||||
}
|
||||
|
||||
objects.push_back(load_segment);
|
||||
loaded_code_objects.back()->LoadedSegments().push_back(load_segment);
|
||||
loaded_code_objects.back()->LoadedSegments().push_back(load_segment.get());
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -1398,7 +1388,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
|
||||
if (s->memSize() == 0)
|
||||
return HSA_STATUS_SUCCESS;
|
||||
amdgpu_hsa_elf_segment_t segment = (amdgpu_hsa_elf_segment_t)(s->type() - PT_LOOS);
|
||||
Segment *new_seg = nullptr;
|
||||
std::shared_ptr<Segment> new_seg;
|
||||
bool need_alloc = true;
|
||||
if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM && nullptr != program_allocation_segment) {
|
||||
new_seg = program_allocation_segment;
|
||||
@@ -1407,7 +1397,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
|
||||
if (need_alloc) {
|
||||
void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true);
|
||||
if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
|
||||
new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
|
||||
new_seg = std::make_shared<Segment>(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
|
||||
new_seg->Copy(s->vaddr(), s->data(), s->imageSize());
|
||||
objects.push_back(new_seg);
|
||||
|
||||
@@ -1416,7 +1406,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
|
||||
}
|
||||
}
|
||||
assert(new_seg);
|
||||
loaded_code_objects.back()->LoadedSegments().push_back(new_seg);
|
||||
loaded_code_objects.back()->LoadedSegments().push_back(new_seg.get());
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -1471,7 +1461,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
|
||||
}
|
||||
|
||||
uint64_t address = SymbolAddress(agent, sym);
|
||||
SymbolImpl *symbol = nullptr;
|
||||
std::shared_ptr<SymbolImpl> symbol;
|
||||
if (string_ends_with(sym->GetSymbolName(), ".kd")) {
|
||||
// V3.
|
||||
llvm::amdhsa::kernel_descriptor_t kd;
|
||||
@@ -1486,7 +1476,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
|
||||
|
||||
uint64_t size = sym->Size();
|
||||
|
||||
KernelSymbol *kernel_symbol = new KernelSymbol(true,
|
||||
std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
|
||||
sym->GetModuleName(),
|
||||
sym->GetSymbolName(),
|
||||
sym->Linkage(),
|
||||
@@ -1502,7 +1492,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
|
||||
address);
|
||||
symbol = kernel_symbol;
|
||||
} else if (sym->IsVariableSymbol()) {
|
||||
symbol = new VariableSymbol(true,
|
||||
symbol = std::make_shared<VariableSymbol>(true,
|
||||
sym->GetModuleName(),
|
||||
sym->GetSymbolName(),
|
||||
sym->Linkage(),
|
||||
@@ -1537,7 +1527,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
|
||||
// calculate end of segment - symbol value.
|
||||
size = sym->GetSection()->size() - sym->SectionOffset();
|
||||
}
|
||||
KernelSymbol *kernel_symbol = new KernelSymbol(true,
|
||||
std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
|
||||
sym->GetModuleName(),
|
||||
sym->GetSymbolName(),
|
||||
sym->Linkage(),
|
||||
@@ -1970,7 +1960,7 @@ void ExecutableImpl::Print(std::ostream& out)
|
||||
<< std::endl << std::endl;
|
||||
out << "Loaded Objects (total " << objects.size() << ")" << std::endl;
|
||||
size_t i = 0;
|
||||
for (ExecutableObject* o : objects) {
|
||||
for (const auto& o : objects) {
|
||||
out << "Loaded Object " << i++ << ": ";
|
||||
o->Print(out);
|
||||
out << std::endl;
|
||||
|
||||
@@ -461,7 +461,7 @@ public:
|
||||
};
|
||||
|
||||
typedef std::string ProgramSymbol;
|
||||
typedef std::unordered_map<ProgramSymbol, SymbolImpl*> ProgramSymbolMap;
|
||||
typedef std::unordered_map<ProgramSymbol, std::shared_ptr<SymbolImpl>> ProgramSymbolMap;
|
||||
|
||||
typedef std::pair<std::string, hsa_agent_t> AgentSymbol;
|
||||
struct ASC {
|
||||
@@ -476,7 +476,7 @@ struct ASH {
|
||||
return h ^ (i << 1);
|
||||
}
|
||||
};
|
||||
typedef std::unordered_map<AgentSymbol, SymbolImpl*, ASH, ASC> AgentSymbolMap;
|
||||
typedef std::unordered_map<AgentSymbol, std::shared_ptr<SymbolImpl>, ASH, ASC> AgentSymbolMap;
|
||||
|
||||
class ExecutableImpl final: public Executable {
|
||||
friend class AmdHsaCodeLoader;
|
||||
@@ -634,15 +634,15 @@ private:
|
||||
|
||||
ProgramSymbolMap program_symbols_;
|
||||
AgentSymbolMap agent_symbols_;
|
||||
std::vector<ExecutableObject*> objects;
|
||||
Segment *program_allocation_segment;
|
||||
std::vector<LoadedCodeObjectImpl*> loaded_code_objects;
|
||||
std::vector<std::shared_ptr<ExecutableObject>> objects;
|
||||
std::shared_ptr<Segment> program_allocation_segment;
|
||||
std::vector<std::shared_ptr<LoadedCodeObjectImpl>> loaded_code_objects;
|
||||
};
|
||||
|
||||
class AmdHsaCodeLoader : public Loader {
|
||||
private:
|
||||
Context* context;
|
||||
std::vector<Executable*> executables;
|
||||
std::vector<std::shared_ptr<Executable>> executables;
|
||||
amd::hsa::common::ReaderWriterLock rw_lock_;
|
||||
|
||||
public:
|
||||
|
||||
@@ -282,7 +282,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
|
||||
size_t interval, size_t latency, size_t buffer_size,
|
||||
hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data,
|
||||
hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) {
|
||||
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
|
||||
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
|
||||
|
||||
handle->handle = ++pc_sampling_id_;
|
||||
// create a new PcSamplingSession(agent, method, units, interval, latency, buffer_size,
|
||||
@@ -305,7 +305,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
|
||||
}
|
||||
|
||||
hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
|
||||
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
|
||||
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
|
||||
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
|
||||
if (pcSamplingSessionIt == pc_sampling_.end()) {
|
||||
debug_warning(false && "Cannot find PcSampling session");
|
||||
@@ -319,7 +319,7 @@ hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
|
||||
}
|
||||
|
||||
hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
|
||||
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
|
||||
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
|
||||
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
|
||||
if (pcSamplingSessionIt == pc_sampling_.end()) {
|
||||
debug_warning(false && "Cannot find PcSampling session");
|
||||
@@ -331,7 +331,7 @@ hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
|
||||
}
|
||||
|
||||
hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
|
||||
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
|
||||
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
|
||||
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
|
||||
if (pcSamplingSessionIt == pc_sampling_.end()) {
|
||||
debug_warning(false && "Cannot find PcSampling session");
|
||||
@@ -343,7 +343,7 @@ hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
|
||||
}
|
||||
|
||||
hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) {
|
||||
ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
|
||||
std::lock_guard<std::mutex> lock(pc_sampling_lock_);
|
||||
auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
|
||||
if (pcSamplingSessionIt == pc_sampling_.end()) {
|
||||
debug_warning(false && "Cannot find PcSampling session");
|
||||
|
||||
@@ -166,7 +166,7 @@ class PcsRuntime {
|
||||
}
|
||||
// Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle
|
||||
std::map<uint64_t, PcSamplingSession> pc_sampling_;
|
||||
KernelMutex pc_sampling_lock_;
|
||||
std::mutex pc_sampling_lock_;
|
||||
uint64_t pc_sampling_id_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(PcsRuntime);
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário