Temporary: Set AllocateGTTAccess and node_id for MES
Temporary change to set the AllocateGTTAccess flag and node_id on MES devices.

Change-Id: I22385d11b17b76cfb44278fa0d8a09bc8721cea6
Cette révision appartient à :
@@ -44,7 +44,7 @@
|
||||
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
std::function<void*(size_t, size_t, uint32_t)> BaseShared::allocate_ = nullptr;
|
||||
std::function<void*(size_t, size_t, uint32_t, int)> BaseShared::allocate_ = nullptr;
|
||||
std::function<void(void*)> BaseShared::free_ = nullptr;
|
||||
} // namespace core
|
||||
} // namespace rocr
|
||||
|
||||
@@ -58,14 +58,14 @@ namespace core {
|
||||
class BaseShared {
|
||||
public:
|
||||
static void SetAllocateAndFree(
|
||||
const std::function<void*(size_t, size_t, uint32_t)>& allocate,
|
||||
const std::function<void*(size_t, size_t, uint32_t, int)>& allocate,
|
||||
const std::function<void(void*)>& free) {
|
||||
allocate_ = allocate;
|
||||
free_ = free;
|
||||
}
|
||||
|
||||
protected:
|
||||
static std::function<void*(size_t, size_t, uint32_t)> allocate_;
|
||||
static std::function<void*(size_t, size_t, uint32_t, int)> allocate_;
|
||||
static std::function<void(void*)> free_;
|
||||
};
|
||||
|
||||
@@ -73,7 +73,19 @@ class BaseShared {
|
||||
template <typename T> class PageAllocator : private BaseShared {
|
||||
public:
|
||||
__forceinline static T* alloc(int flags = 0) {
|
||||
T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags));
|
||||
T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, 0));
|
||||
if (ret == nullptr) throw std::bad_alloc();
|
||||
|
||||
MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
|
||||
|
||||
new (ret) T;
|
||||
|
||||
throwGuard.Dismiss();
|
||||
return ret;
|
||||
}
|
||||
|
||||
__forceinline static T* alloc(int agent_node_id, int flags) {
|
||||
T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id));
|
||||
if (ret == nullptr) throw std::bad_alloc();
|
||||
|
||||
MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
|
||||
@@ -107,6 +119,16 @@ class Shared final : private BaseShared {
|
||||
shared_object_ = PageAllocator<T>::alloc(flags);
|
||||
}
|
||||
|
||||
explicit Shared(int agent_node_id, Allocator* pool = nullptr, int flags = 0) : pool_(pool) {
|
||||
assert(allocate_ != nullptr && free_ != nullptr &&
|
||||
"Shared object allocator is not set");
|
||||
|
||||
if (pool_)
|
||||
shared_object_ = pool_->alloc();
|
||||
else
|
||||
shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
|
||||
}
|
||||
|
||||
~Shared() {
|
||||
assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
|
||||
|
||||
@@ -147,6 +169,12 @@ template <typename T> class Shared<T, PageAllocator<T>> final : private BaseShar
|
||||
shared_object_ = PageAllocator<T>::alloc(flags);
|
||||
}
|
||||
|
||||
Shared(int agent_node_id, int flags) {
|
||||
assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
|
||||
|
||||
shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
|
||||
}
|
||||
|
||||
~Shared() {
|
||||
assert(allocate_ != nullptr && free_ != nullptr &&
|
||||
"Shared object allocator is not set");
|
||||
@@ -183,7 +211,7 @@ template <typename T, size_t Align> class SharedArray final : private BaseShared
|
||||
static_assert((__alignof(T) <= Align) || (Align == 0), "Align is less than alignof(T)");
|
||||
|
||||
shared_object_ =
|
||||
reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0));
|
||||
reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0, 0));
|
||||
if (shared_object_ == nullptr) throw std::bad_alloc();
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
@@ -100,7 +100,7 @@ class MemoryRegion : public core::MemoryRegion {
|
||||
|
||||
~MemoryRegion();
|
||||
|
||||
hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const;
|
||||
hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id = 0) const;
|
||||
|
||||
hsa_status_t Free(void* address, size_t size) const;
|
||||
|
||||
@@ -200,7 +200,7 @@ class MemoryRegion : public core::MemoryRegion {
|
||||
const core::Runtime::LinkInfo& link_info) const;
|
||||
|
||||
// Operational body for Allocate. Recursive.
|
||||
hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const;
|
||||
hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const;
|
||||
|
||||
// Operational body for Free. Recursive.
|
||||
hsa_status_t FreeImpl(void* address, size_t size) const;
|
||||
|
||||
@@ -99,11 +99,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
|
||||
AllocateAsan = (1 << 6), // ASAN - First page of allocation remapped to system memory
|
||||
AllocatePinned = (1 << 7), // Currently treating Pinned memory as NoSubstitute
|
||||
AllocateMemoryOnly = (1 << 8), // Memory only handle from thunk, no virtual address
|
||||
// Flag to allocate system memory with GTT Access
|
||||
// Note: The node_id needs to be the node_id of the device even though this is allocating
|
||||
// system memory
|
||||
AllocateGTTAccess = (1 << 9),
|
||||
};
|
||||
|
||||
typedef uint32_t AllocateFlags;
|
||||
|
||||
virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const = 0;
|
||||
virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const = 0;
|
||||
|
||||
virtual hsa_status_t Free(void* address, size_t size) const = 0;
|
||||
|
||||
|
||||
@@ -162,6 +162,7 @@ struct SharedQueue {
|
||||
class LocalQueue {
|
||||
public:
|
||||
LocalQueue(int mem_flags) : local_queue_(mem_flags) {}
|
||||
LocalQueue(int agent_node_id, int mem_flags) : local_queue_(agent_node_id, mem_flags) {}
|
||||
SharedQueue* queue() const { return local_queue_.shared_object(); }
|
||||
|
||||
private:
|
||||
@@ -183,6 +184,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
|
||||
public_handle_ = Convert(this);
|
||||
}
|
||||
|
||||
Queue(int agent_node_id, int mem_flags = 0) : LocalQueue(agent_node_id, mem_flags), amd_queue_(queue()->amd_queue) {
|
||||
queue()->core_queue = this;
|
||||
public_handle_ = Convert(this);
|
||||
}
|
||||
|
||||
virtual ~Queue() {}
|
||||
|
||||
virtual void Destroy() { delete this; }
|
||||
|
||||
@@ -198,7 +198,7 @@ class Runtime {
|
||||
/// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
|
||||
hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size,
|
||||
MemoryRegion::AllocateFlags alloc_flags,
|
||||
void** address);
|
||||
void** address, int agent_node_id = 0);
|
||||
|
||||
/// @brief Free memory previously allocated with AllocateMemory.
|
||||
///
|
||||
@@ -419,7 +419,7 @@ class Runtime {
|
||||
|
||||
amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; }
|
||||
|
||||
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)>&
|
||||
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)>&
|
||||
system_allocator() {
|
||||
return system_allocator_;
|
||||
}
|
||||
@@ -659,7 +659,7 @@ class Runtime {
|
||||
prefetch_map_t prefetch_map_;
|
||||
|
||||
// Allocator using ::system_region_
|
||||
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;
|
||||
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)> system_allocator_;
|
||||
|
||||
// Deallocator using ::system_region_
|
||||
std::function<void(void*)> system_deallocator_;
|
||||
|
||||
@@ -80,7 +80,7 @@ int AqlQueue::rtti_id_ = 0;
|
||||
|
||||
AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch,
|
||||
core::HsaEventCallback callback, void* err_data, bool is_kv)
|
||||
: Queue(agent->isMES() ? MemoryRegion::AllocateNonPaged : 0),
|
||||
: Queue(agent->node_id(), agent->isMES() ? (MemoryRegion::AllocateGTTAccess | MemoryRegion::AllocateNonPaged) : 0),
|
||||
LocalSignal(0, false),
|
||||
DoorbellSignal(signal()),
|
||||
ring_buf_(nullptr),
|
||||
|
||||
@@ -59,8 +59,7 @@ namespace AMD {
|
||||
size_t MemoryRegion::max_sysmem_alloc_size_ = 0;
|
||||
size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE);
|
||||
|
||||
void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
|
||||
HSAuint32 node_id, size_t size) {
|
||||
void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) {
|
||||
void* ret = NULL;
|
||||
const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
|
||||
return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
|
||||
@@ -170,13 +169,13 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
|
||||
|
||||
MemoryRegion::~MemoryRegion() {}
|
||||
|
||||
hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const {
|
||||
hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
|
||||
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
|
||||
return AllocateImpl(size, alloc_flags, address);
|
||||
return AllocateImpl(size, alloc_flags, address, agent_node_id);
|
||||
}
|
||||
|
||||
hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
|
||||
void** address) const {
|
||||
void** address, int agent_node_id) const {
|
||||
if (address == NULL) {
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
@@ -209,6 +208,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
|
||||
kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain);
|
||||
kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute);
|
||||
|
||||
kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess);
|
||||
|
||||
// Only allow using the suballocator for ordinary VRAM.
|
||||
if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
|
||||
bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
|
||||
@@ -228,12 +229,14 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
|
||||
}
|
||||
}
|
||||
|
||||
const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id();
|
||||
|
||||
// Allocate memory.
|
||||
// If it fails attempt to release memory from the block allocator and retry.
|
||||
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
|
||||
*address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
|
||||
if (*address == nullptr) {
|
||||
owner()->Trim();
|
||||
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
|
||||
*address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
|
||||
}
|
||||
|
||||
if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS;
|
||||
@@ -768,7 +771,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated
|
||||
size_t bsize = AlignUp(request_size, block_size());
|
||||
|
||||
hsa_status_t err = region_.AllocateImpl(
|
||||
bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret);
|
||||
bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret, 0);
|
||||
if (err != HSA_STATUS_SUCCESS)
|
||||
throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
|
||||
assert(ret != nullptr && "Region returned nullptr on success.");
|
||||
|
||||
@@ -208,12 +208,12 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
|
||||
for (auto pool : system_regions_fine_) {
|
||||
if (pool->kernarg()) {
|
||||
system_allocator_ = [pool](size_t size, size_t alignment,
|
||||
MemoryRegion::AllocateFlags alloc_flags) -> void* {
|
||||
MemoryRegion::AllocateFlags alloc_flags, int agent_node_id) -> void* {
|
||||
assert(alignment <= 4096);
|
||||
void* ptr = NULL;
|
||||
return (HSA_STATUS_SUCCESS ==
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
|
||||
&ptr))
|
||||
&ptr, agent_node_id))
|
||||
? ptr
|
||||
: NULL;
|
||||
};
|
||||
@@ -311,9 +311,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
|
||||
|
||||
hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
|
||||
MemoryRegion::AllocateFlags alloc_flags,
|
||||
void** address) {
|
||||
void** address, int agent_node_id) {
|
||||
size_t size_requested = size; // region->Allocate(...) may align-up size to granularity
|
||||
hsa_status_t status = region->Allocate(size, alloc_flags, address);
|
||||
hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
|
||||
// Track the allocation result so that it could be freed properly.
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
@@ -497,7 +497,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
|
||||
requires the caller to specify all allowed agents we can't assume that a peer mapped pointer
|
||||
would remain mapped for the duration of the copy.
|
||||
*/
|
||||
void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags);
|
||||
void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags, 0);
|
||||
MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); });
|
||||
hsa_status_t err = src_agent->DmaCopy(temp, source, size);
|
||||
if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
|
||||
@@ -3005,7 +3005,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
|
||||
|
||||
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
|
||||
void* thunk_handle;
|
||||
hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle);
|
||||
hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
memory_handle_map_.emplace(std::piecewise_construct,
|
||||
std::forward_as_tuple(thunk_handle),
|
||||
|
||||
@@ -73,11 +73,11 @@ SharedSignal* SharedSignalPool_t::alloc() {
|
||||
ScopedAcquire<HybridMutex> lock(&lock_);
|
||||
if (free_list_.empty()) {
|
||||
SharedSignal* block = reinterpret_cast<SharedSignal*>(
|
||||
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
|
||||
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
|
||||
if (block == nullptr) {
|
||||
block_size_ = minblock_;
|
||||
block = reinterpret_cast<SharedSignal*>(
|
||||
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
|
||||
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
|
||||
if (block == nullptr) throw std::bad_alloc();
|
||||
}
|
||||
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur