Temporary: Set AllocateGTTAccess and node_id for MES

Temporary change: set the AllocateGTTAccess flag and pass the agent's node_id
when allocating queue memory on MES devices.

Change-Id: I22385d11b17b76cfb44278fa0d8a09bc8721cea6
Cette révision appartient à :
David Yat Sin
2024-03-07 23:54:17 +00:00
Parent 9e8f185397
révision efe455c2fa
10 fichiers modifiés avec 69 ajouts et 28 suppressions
+1 -1
Voir le fichier
@@ -44,7 +44,7 @@
// Out-of-class definitions of BaseShared's static allocator callbacks.
// Both callbacks start out empty (nullptr).
namespace rocr {
namespace core {
// NOTE(review): the next two lines both define BaseShared::allocate_. They look like the
// removed (three-argument) and added (four-argument) sides of the change — confirm that only
// the four-argument form, with the trailing int, is meant to remain.
std::function<void*(size_t, size_t, uint32_t)> BaseShared::allocate_ = nullptr;
std::function<void*(size_t, size_t, uint32_t, int)> BaseShared::allocate_ = nullptr;
// Release callback; takes the pointer to free.
std::function<void(void*)> BaseShared::free_ = nullptr;
} // namespace core
} // namespace rocr
+32 -4
Voir le fichier
@@ -58,14 +58,14 @@ namespace core {
// Common base holding the allocate/free callbacks used by the allocator helpers that derive
// from it. The callbacks are static members, so a single pair is shared by all users.
// NOTE(review): the `allocate` parameter and the `allocate_` member each appear twice below
// (three-argument and four-argument std::function signatures). These look like the removed
// and added sides of the change — confirm that only the four-argument form (with the trailing
// int) is meant to remain.
class BaseShared {
public:
// Installs the callbacks: copies `allocate` into allocate_ and `free` into free_.
static void SetAllocateAndFree(
const std::function<void*(size_t, size_t, uint32_t)>& allocate,
const std::function<void*(size_t, size_t, uint32_t, int)>& allocate,
const std::function<void(void*)>& free) {
allocate_ = allocate;
free_ = free;
}
protected:
// Allocation callback. Callers in this file pass (size, alignment, flags[, agent node id]).
static std::function<void*(size_t, size_t, uint32_t)> allocate_;
static std::function<void*(size_t, size_t, uint32_t, int)> allocate_;
// Release callback; receives a pointer previously returned by allocate_.
static std::function<void(void*)> free_;
};
@@ -73,7 +73,19 @@ class BaseShared {
template <typename T> class PageAllocator : private BaseShared {
public:
// Obtains storage for one T through allocate_ and default-constructs a T in it.
// The size request is AlignUp(sizeof(T), 4096) and 4096 is passed as the alignment argument;
// `flags` is forwarded unchanged and 0 is passed as the trailing (node id) argument.
// Throws std::bad_alloc if allocate_ returns nullptr.
// NOTE(review): two `T* ret = ...` lines follow (three- and four-argument allocate_ calls).
// They look like the removed and added sides of the change — confirm only the second remains.
__forceinline static T* alloc(int flags = 0) {
T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags));
T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, 0));
if (ret == nullptr) throw std::bad_alloc();
// Presumably hands the storage back through free_ if construction below throws; this
// depends on MAKE_NAMED_SCOPE_GUARD, which is defined elsewhere — TODO confirm.
MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
new (ret) T;
// Construction finished; presumably Dismiss() cancels the guard's cleanup so the storage
// stays with the caller.
throwGuard.Dismiss();
return ret;
}
__forceinline static T* alloc(int agent_node_id, int flags) {
T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id));
if (ret == nullptr) throw std::bad_alloc();
MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
@@ -107,6 +119,16 @@ class Shared final : private BaseShared {
shared_object_ = PageAllocator<T>::alloc(flags);
}
explicit Shared(int agent_node_id, Allocator* pool = nullptr, int flags = 0) : pool_(pool) {
assert(allocate_ != nullptr && free_ != nullptr &&
"Shared object allocator is not set");
if (pool_)
shared_object_ = pool_->alloc();
else
shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
}
~Shared() {
assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
@@ -147,6 +169,12 @@ template <typename T> class Shared<T, PageAllocator<T>> final : private BaseShar
shared_object_ = PageAllocator<T>::alloc(flags);
}
Shared(int agent_node_id, int flags) {
assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
}
~Shared() {
assert(allocate_ != nullptr && free_ != nullptr &&
"Shared object allocator is not set");
@@ -183,7 +211,7 @@ template <typename T, size_t Align> class SharedArray final : private BaseShared
static_assert((__alignof(T) <= Align) || (Align == 0), "Align is less than alignof(T)");
shared_object_ =
reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0));
reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0, 0));
if (shared_object_ == nullptr) throw std::bad_alloc();
size_t i = 0;
+2 -2
Voir le fichier
@@ -100,7 +100,7 @@ class MemoryRegion : public core::MemoryRegion {
~MemoryRegion();
// Allocates memory from this region. The definition acquires the owner's agent_memory_lock_
// and forwards every argument to AllocateImpl.
// agent_node_id is used as the allocation's node id only when alloc_flags contains
// AllocateGTTAccess; otherwise the owner's node id is used.
// NOTE(review): both a three-parameter and a four-parameter declaration appear below. They
// look like the removed and added sides of the change — confirm only the second remains.
// NOTE(review): the `= 0` default exists only on this derived declaration; the pure virtual
// in core::MemoryRegion declares agent_node_id without a default, so calls made through the
// base type must pass it explicitly.
hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const;
hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id = 0) const;
// Releases memory previously obtained from this region.
hsa_status_t Free(void* address, size_t size) const;
@@ -200,7 +200,7 @@ class MemoryRegion : public core::MemoryRegion {
const core::Runtime::LinkInfo& link_info) const;
// Operational body for Allocate. Recursive.
// Called by Allocate after it has taken the owner's agent_memory_lock_. Returns
// HSA_STATUS_ERROR_INVALID_ARGUMENT when `address` is NULL.
// NOTE(review): both a three-parameter and a four-parameter declaration appear below. They
// look like the removed and added sides of the change — confirm only the second remains.
hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const;
hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const;
// Operational body for Free. Recursive.
hsa_status_t FreeImpl(void* address, size_t size) const;
+5 -1
Voir le fichier
@@ -99,11 +99,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
AllocateAsan = (1 << 6), // ASAN - First page of allocation remapped to system memory
AllocatePinned = (1 << 7), // Currently treating Pinned memory as NoSubstitute
AllocateMemoryOnly = (1 << 8), // Memory only handle from thunk, no virtual address
// Flag to allocate system memory with GTT Access
// Note: The node_id needs to be the node_id of the device even though this is allocating
// system memory
AllocateGTTAccess = (1 << 9),
};
// Bitmask type that carries a combination of the Allocate* flag values.
typedef uint32_t AllocateFlags;
// Allocation entry point implemented by concrete regions. `size` is taken by non-const
// reference and the result pointer is written through `address`.
// NOTE(review): both a three-parameter and a four-parameter pure virtual appear below. They
// look like the removed and added sides of the change — confirm only the second remains.
virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const = 0;
virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const = 0;
// Release entry point implemented by concrete regions.
virtual hsa_status_t Free(void* address, size_t size) const = 0;
+6
Voir le fichier
@@ -162,6 +162,7 @@ struct SharedQueue {
class LocalQueue {
public:
// Forwards mem_flags to local_queue_'s constructor.
LocalQueue(int mem_flags) : local_queue_(mem_flags) {}
// Forwards agent_node_id and mem_flags, in that order, to local_queue_'s constructor.
LocalQueue(int agent_node_id, int mem_flags) : local_queue_(agent_node_id, mem_flags) {}
// Returns the SharedQueue pointer exposed by local_queue_.shared_object().
SharedQueue* queue() const { return local_queue_.shared_object(); }
private:
@@ -183,6 +184,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
public_handle_ = Convert(this);
}
/// Constructs a queue for a given agent node.
///
/// `agent_node_id` and `mem_flags` are forwarded to the LocalQueue base; amd_queue_ is bound
/// to the amd_queue member of the shared queue. The body then records this object in the
/// shared queue's core_queue field and stores Convert(this) in public_handle_.
/// NOTE(review): because mem_flags is defaulted, a one-argument call could be ambiguous with
/// any other constructor taking a single int — verify against the other Queue constructors.
Queue(int agent_node_id, int mem_flags = 0)
    : LocalQueue(agent_node_id, mem_flags), amd_queue_(queue()->amd_queue) {
  auto* const shared = queue();
  shared->core_queue = this;
  public_handle_ = Convert(this);
}
// Virtual destructor with an empty body.
virtual ~Queue() {}
// Destroys this queue by deleting the object itself; the object must not be used afterwards.
virtual void Destroy() { delete this; }
+3 -3
Voir le fichier
@@ -198,7 +198,7 @@ class Runtime {
/// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size,
MemoryRegion::AllocateFlags alloc_flags,
void** address);
void** address, int agent_node_id = 0);
/// @brief Free memory previously allocated with AllocateMemory.
///
@@ -419,7 +419,7 @@ class Runtime {
amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; }
// Returns a non-const reference to the system_allocator_ member.
// NOTE(review): two return-type lines appear below (three- and four-argument std::function
// signatures). They look like the removed and added sides of the change — confirm only the
// four-argument form, with agent_node_id, remains.
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)>&
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)>&
system_allocator() {
return system_allocator_;
}
@@ -659,7 +659,7 @@ class Runtime {
prefetch_map_t prefetch_map_;
// Allocator using ::system_region_
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)> system_allocator_;
// Deallocator using ::system_region_
std::function<void(void*)> system_deallocator_;
+1 -1
Voir le fichier
@@ -80,7 +80,7 @@ int AqlQueue::rtti_id_ = 0;
AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch,
core::HsaEventCallback callback, void* err_data, bool is_kv)
: Queue(agent->isMES() ? MemoryRegion::AllocateNonPaged : 0),
: Queue(agent->node_id(), agent->isMES() ? (MemoryRegion::AllocateGTTAccess | MemoryRegion::AllocateNonPaged) : 0),
LocalSignal(0, false),
DoorbellSignal(signal()),
ring_buf_(nullptr),
+11 -8
Voir le fichier
@@ -59,8 +59,7 @@ namespace AMD {
size_t MemoryRegion::max_sysmem_alloc_size_ = 0;
size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE);
void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
HSAuint32 node_id, size_t size) {
void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) {
void* ret = NULL;
const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
@@ -170,13 +169,13 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
MemoryRegion::~MemoryRegion() {}
// Locked entry point for allocation: acquires the owner's agent_memory_lock_ for the duration
// of the call and delegates the work to AllocateImpl, returning its status.
// NOTE(review): the signature line and the return statement each appear twice (without and
// with agent_node_id). They look like the removed and added sides of the change — confirm
// only the agent_node_id variants remain.
hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const {
hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
return AllocateImpl(size, alloc_flags, address);
return AllocateImpl(size, alloc_flags, address, agent_node_id);
}
hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
void** address) const {
void** address, int agent_node_id) const {
if (address == NULL) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
@@ -209,6 +208,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain);
kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute);
kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess);
// Only allow using the suballocator for ordinary VRAM.
if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
@@ -228,12 +229,14 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
}
}
const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id();
// Allocate memory.
// If it fails attempt to release memory from the block allocator and retry.
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
*address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
if (*address == nullptr) {
owner()->Trim();
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
*address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
}
if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS;
@@ -768,7 +771,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated
size_t bsize = AlignUp(request_size, block_size());
hsa_status_t err = region_.AllocateImpl(
bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret);
bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret, 0);
if (err != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
assert(ret != nullptr && "Region returned nullptr on success.");
+6 -6
Voir le fichier
@@ -208,12 +208,12 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
for (auto pool : system_regions_fine_) {
if (pool->kernarg()) {
system_allocator_ = [pool](size_t size, size_t alignment,
MemoryRegion::AllocateFlags alloc_flags) -> void* {
MemoryRegion::AllocateFlags alloc_flags, int agent_node_id) -> void* {
assert(alignment <= 4096);
void* ptr = NULL;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
&ptr))
&ptr, agent_node_id))
? ptr
: NULL;
};
@@ -311,9 +311,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
MemoryRegion::AllocateFlags alloc_flags,
void** address) {
void** address, int agent_node_id) {
size_t size_requested = size; // region->Allocate(...) may align-up size to granularity
hsa_status_t status = region->Allocate(size, alloc_flags, address);
hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
// Track the allocation result so that it could be freed properly.
if (status == HSA_STATUS_SUCCESS) {
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
@@ -497,7 +497,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
requires the caller to specify all allowed agents we can't assume that a peer mapped pointer
would remain mapped for the duration of the copy.
*/
void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags);
void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags, 0);
MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); });
hsa_status_t err = src_agent->DmaCopy(temp, source, size);
if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
@@ -3005,7 +3005,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
void* thunk_handle;
hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle);
hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0);
if (status == HSA_STATUS_SUCCESS) {
memory_handle_map_.emplace(std::piecewise_construct,
std::forward_as_tuple(thunk_handle),
+2 -2
Voir le fichier
@@ -73,11 +73,11 @@ SharedSignal* SharedSignalPool_t::alloc() {
ScopedAcquire<HybridMutex> lock(&lock_);
if (free_list_.empty()) {
SharedSignal* block = reinterpret_cast<SharedSignal*>(
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
if (block == nullptr) {
block_size_ = minblock_;
block = reinterpret_cast<SharedSignal*>(
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
if (block == nullptr) throw std::bad_alloc();
}