Refactor: Consolidate calls to hsaKmtAllocMemory

Route all device-visible system memory allocations through system_allocator.

Change-Id: I5e90a1bf491e432678a6d8ab1f9f3770734cbda1


[ROCm/ROCR-Runtime commit: 74f5aca93d]
Tento commit je obsažen v:
Jay Cornwall
2016-08-22 20:19:21 -05:00
rodič 7e2179da7b
revize d5ecfae62f
Změněno 17 souborů: 89 přidání a 153 odebrání
+2 -2
Zobrazit soubor
@@ -43,6 +43,6 @@
#include "core/common/shared.h"
namespace core {
std::function<void*(size_t, size_t)> BaseShared::allocate_=nullptr;
std::function<void(void*)> BaseShared::free_=nullptr;
std::function<void*(size_t, size_t, uint32_t)> BaseShared::allocate_ = nullptr;
std::function<void(void*)> BaseShared::free_ = nullptr;
}
+3 -3
Zobrazit soubor
@@ -55,14 +55,14 @@ namespace core {
class BaseShared {
public:
static void SetAllocateAndFree(
const std::function<void*(size_t, size_t)>& allocate,
const std::function<void*(size_t, size_t, uint32_t)>& allocate,
const std::function<void(void*)>& free) {
allocate_ = allocate;
free_ = free;
}
protected:
static std::function<void*(size_t, size_t)> allocate_;
static std::function<void*(size_t, size_t, uint32_t)> allocate_;
static std::function<void(void*)> free_;
};
@@ -78,7 +78,7 @@ class Shared : public BaseShared {
"Align is less than alignof(T)");
shared_object_ =
reinterpret_cast<T*>(allocate_(sizeof(T), Max(__alignof(T), Align)));
reinterpret_cast<T*>(allocate_(sizeof(T), Max(__alignof(T), Align), 0));
assert(shared_object_ != NULL && "Failed on allocating shared_object_");
-1
Zobrazit soubor
@@ -49,7 +49,6 @@
#include <vector>
#include "core/inc/runtime.h"
#include "core/inc/checked.h"
#include "core/inc/isa.h"
#include "core/inc/queue.h"
@@ -44,6 +44,7 @@
#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_
#include <map>
#include <mutex>
#include <stdint.h>
#include "core/inc/blit.h"
+1 -3
Zobrazit soubor
@@ -99,9 +99,7 @@ class MemoryRegion : public core::MemoryRegion {
~MemoryRegion();
hsa_status_t Allocate(size_t size, void** address) const;
hsa_status_t Allocate(bool restrict_access, size_t size,
hsa_status_t Allocate(size_t size, AllocateFlags alloc_flags,
void** address) const;
hsa_status_t Free(void* address, size_t size) const;
+2 -1
Zobrazit soubor
@@ -43,7 +43,8 @@
#ifndef HSA_RUNTME_CORE_INC_CHECKED_H_
#define HSA_RUNTME_CORE_INC_CHECKED_H_
#include "stdint.h"
#include <stdint.h>
#include <stdlib.h>
namespace core {
+11 -2
Zobrazit soubor
@@ -47,7 +47,6 @@
#include <vector>
#include "core/inc/runtime.h"
#include "core/inc/agent.h"
#include "core/inc/checked.h"
@@ -81,7 +80,17 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
return reinterpret_cast<MemoryRegion*>(region.handle);
}
virtual hsa_status_t Allocate(size_t size, void** address) const = 0;
// Modifier bits accepted by MemoryRegion::Allocate. Every enumerator other
// than AllocateNoFlags occupies its own bit, so several of them can be
// combined with bitwise OR into a single AllocateFlags value.
enum AllocateEnum {
  // No modifiers requested.
  AllocateNoFlags = 0x0,
  // Don't map system memory to GPU agents.
  AllocateRestrict = 0x1,
  // Set executable permission on the allocation.
  AllocateExecutable = 0x2,
  // Map the virtual address allocation twice onto the same backing store.
  AllocateDoubleMap = 0x4,
};

// Storage type for a bitwise OR of AllocateEnum values.
using AllocateFlags = uint32_t;
virtual hsa_status_t Allocate(size_t size, AllocateFlags alloc_flags,
void** address) const = 0;
virtual hsa_status_t Free(void* address, size_t size) const = 0;
-1
Zobrazit soubor
@@ -48,7 +48,6 @@
#include "core/common/shared.h"
#include "core/inc/runtime.h"
#include "core/inc/checked.h"
#include "core/util/utils.h"
+6 -15
Zobrazit soubor
@@ -151,25 +151,14 @@ class Runtime {
///
/// @param [in] region Pointer to region object.
/// @param [in] size Allocation size in bytes.
/// @param [in] alloc_flags Modifiers to pass to MemoryRegion allocator.
/// @param [out] address Pointer to store the allocation result.
///
/// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size,
MemoryRegion::AllocateFlags alloc_flags,
void** address);
/// @brief Allocate memory on a particular region with option to restrict
/// access to the owning agent.
///
/// @param [in] restrict_access If true, the allocation result would only be
/// accessible to the agent(s) that own the region object.
/// @param [in] region Pointer to region object.
/// @param [in] size Allocation size in bytes.
/// @param [out] address Pointer to store the allocation result.
///
/// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
hsa_status_t AllocateMemory(bool restrict_access, const MemoryRegion* region,
size_t size, void** address);
/// @brief Free memory previously allocated with AllocateMemory.
///
/// @param [in] ptr Address of the memory to be freed.
@@ -292,7 +281,8 @@ class Runtime {
amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; }
std::function<void*(size_t, size_t)>& system_allocator() {
std::function<void*(size_t, size_t, MemoryRegion::AllocateFlags)>&
system_allocator() {
return system_allocator_;
}
@@ -446,7 +436,8 @@ class Runtime {
std::map<const void*, AllocationRegion> allocation_map_;
// Allocator using ::system_region_
std::function<void*(size_t, size_t)> system_allocator_;
std::function<void*(size_t, size_t, MemoryRegion::AllocateFlags)>
system_allocator_;
// Deallocator using ::system_region_
std::function<void(void*)> system_deallocator_;
+14 -40
Zobrazit soubor
@@ -266,21 +266,12 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
SignalGuard.Dismiss();
#endif
HsaMemFlags pm4_ib_buf_flags = {0};
pm4_ib_buf_flags.ui32.HostAccess = 1;
pm4_ib_buf_flags.ui32.ExecuteAccess = 1;
pm4_ib_buf_flags.ui32.NoSubstitute = 1;
HSAKMT_STATUS err =
hsaKmtAllocMemory(agent_->node_id(), pm4_ib_size_b_, pm4_ib_buf_flags, &pm4_ib_buf_);
assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(PM4 IB) failed");
err = hsaKmtMapMemoryToGPU(pm4_ib_buf_, pm4_ib_size_b_, NULL);
assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(PM4 IB) failed");
pm4_ib_buf_ = core::Runtime::runtime_singleton_->system_allocator()(
pm4_ib_size_b_, 0x1000, core::MemoryRegion::AllocateExecutable);
if (pm4_ib_buf_ == NULL) return;
MAKE_NAMED_SCOPE_GUARD(PM4IBGuard, [&]() {
hsaKmtUnmapMemoryToGPU(pm4_ib_buf_);
hsaKmtFreeMemory(pm4_ib_buf_, pm4_ib_size_b_);
core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
});
valid_ = true;
@@ -314,8 +305,7 @@ AqlQueue::~AqlQueue() {
}
#endif
hsaKmtUnmapMemoryToGPU(pm4_ib_buf_);
hsaKmtFreeMemory(pm4_ib_buf_, pm4_ib_size_b_);
core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
}
uint64_t AqlQueue::LoadReadIndexAcquire() {
@@ -631,34 +621,19 @@ void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) {
#endif
} else {
// Allocate storage for the ring buffer.
HsaMemFlags flags;
flags.Value = 0;
flags.ui32.HostAccess = 1;
flags.ui32.AtomicAccessPartial = 1;
flags.ui32.ExecuteAccess = 1;
flags.ui32.AQLQueueMemory = 1;
ring_buf_alloc_bytes_ = AlignUp(
queue_size_pkts * static_cast<uint32_t>(sizeof(core::AqlPacket)), 4096);
auto err = hsaKmtAllocMemory(agent_->node_id(), ring_buf_alloc_bytes_,
flags, (void**)&ring_buf_);
if (err != HSAKMT_STATUS_SUCCESS) {
assert(false && "AQL queue memory allocation failure.");
return;
}
ring_buf_ = core::Runtime::runtime_singleton_->system_allocator()(
ring_buf_alloc_bytes_, 0x1000,
core::MemoryRegion::AllocateExecutable |
core::MemoryRegion::AllocateDoubleMap);
HSAuint64 alternate_va;
err = hsaKmtMapMemoryToGPU(ring_buf_, ring_buf_alloc_bytes_, &alternate_va);
assert(ring_buf_ != NULL && "AQL queue memory allocation failure");
if (err != HSAKMT_STATUS_SUCCESS) {
assert(false && "AQL queue memory map failure.");
hsaKmtFreeMemory(ring_buf_, ring_buf_alloc_bytes_);
ring_buf_ = NULL;
return;
}
ring_buf_alloc_bytes_ = 2 * ring_buf_alloc_bytes_;
// The virtual ring allocation is twice as large as requested.
// Each half maps to the same set of physical pages.
ring_buf_alloc_bytes_ *= 2;
}
}
@@ -673,8 +648,7 @@ void AqlQueue::FreeRegisteredRingBuffer() {
(void*)(uintptr_t(ring_buf_) + (ring_buf_alloc_bytes_ / 2)));
#endif
} else {
hsaKmtUnmapMemoryToGPU(ring_buf_);
hsaKmtFreeMemory(ring_buf_, ring_buf_alloc_bytes_ / 2);
core::Runtime::runtime_singleton_->system_deallocator()(ring_buf_);
}
ring_buf_ = NULL;
@@ -537,7 +537,8 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
kernarg_async_ = reinterpret_cast<KernelArgs*>(
core::Runtime::runtime_singleton_->system_allocator()(
queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), 16));
queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), 16,
core::MemoryRegion::AllocateNoFlags));
kernarg_async_mask_ = queue_->public_handle()->size - 1;
@@ -447,26 +447,11 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
// Allocate queue buffer.
queue_size_ = kQueueSize;
HsaMemFlags flags;
flags.Value = 0;
flags.ui32.HostAccess = 1;
flags.ui32.AtomicAccessPartial = 1;
flags.ui32.ExecuteAccess = 1;
queue_start_addr_ =
(char*)core::Runtime::runtime_singleton_->system_allocator()(
queue_size_, 0x1000, core::MemoryRegion::AllocateExecutable);
auto err = hsaKmtAllocMemory(amd_gpu_agent.node_id(), queue_size_, flags,
reinterpret_cast<void**>(&queue_start_addr_));
if (err != HSAKMT_STATUS_SUCCESS) {
assert(false && "SDMA queue memory allocation failure.");
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
HSAuint64 alternate_va;
err = hsaKmtMapMemoryToGPU(queue_start_addr_, queue_size_, &alternate_va);
if (err != HSAKMT_STATUS_SUCCESS) {
assert(false && "AQL queue memory map failure.");
Destroy(agent);
if (queue_start_addr_ == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
@@ -494,7 +479,8 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
fence_base_addr_ = reinterpret_cast<uint32_t*>(
core::Runtime::runtime_singleton_->system_allocator()(
fence_pool_size_ * sizeof(uint32_t), 256));
fence_pool_size_ * sizeof(uint32_t), 256,
core::MemoryRegion::AllocateNoFlags));
if (fence_base_addr_ == NULL) {
Destroy(agent);
@@ -516,8 +502,7 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
if (queue_start_addr_ != NULL && queue_size_ != 0) {
// Release queue buffer.
hsaKmtUnmapMemoryToGPU(queue_start_addr_);
hsaKmtFreeMemory(queue_start_addr_, queue_size_);
core::Runtime::runtime_singleton_->system_deallocator()(queue_start_addr_);
}
if (fence_base_addr_ != NULL) {
+10 -20
Zobrazit soubor
@@ -124,14 +124,8 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
// Populate region list.
InitRegionList();
// Reserve memory for scratch.
InitScratchPool();
// Populate cache list.
InitCacheList();
// Bind the second-level trap handler to this node.
BindTrapHandler();
}
GpuAgent::~GpuAgent() {
@@ -214,21 +208,13 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
}
// Allocate a GPU-visible buffer for the shader.
HsaMemFlags code_buf_flags = {0};
code_buf_flags.ui32.HostAccess = 1;
code_buf_flags.ui32.ExecuteAccess = 1;
code_buf_flags.ui32.NoSubstitute = 1;
size_t header_size =
(assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0);
code_buf_size = AlignUp(header_size + asic_shader->size, 0x1000);
HSAKMT_STATUS err =
hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf);
assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Trap) failed");
err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL);
assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed");
code_buf = core::Runtime::runtime_singleton_->system_allocator()(
code_buf_size, 0x1000, core::MemoryRegion::AllocateExecutable);
assert(code_buf != NULL && "Code buffer allocation failed");
memset(code_buf, 0, code_buf_size);
@@ -265,8 +251,7 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
}
void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const {
hsaKmtUnmapMemoryToGPU(code_buf);
hsaKmtFreeMemory(code_buf, code_buf_size);
core::Runtime::runtime_singleton_->system_deallocator()(code_buf);
}
void GpuAgent::InitRegionList() {
@@ -415,7 +400,8 @@ bool GpuAgent::InitEndTsPool() {
uint64_t* buff = NULL;
if (HSA_STATUS_SUCCESS !=
runtime->AllocateMemory(true, local_region_, alloc_size,
runtime->AllocateMemory(local_region_, alloc_size,
MemoryRegion::AllocateRestrict,
reinterpret_cast<void**>(&buff))) {
return false;
}
@@ -589,6 +575,10 @@ void GpuAgent::InitDma() {
}
hsa_status_t GpuAgent::PostToolsInit() {
// Defer memory allocation until agents have been discovered.
InitScratchPool();
BindTrapHandler();
// Defer utility queue creation to allow tools to intercept.
queues_[QueueUtility] = CreateInterceptibleQueue();
@@ -149,11 +149,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,
MemoryRegion::~MemoryRegion() {}
hsa_status_t MemoryRegion::Allocate(size_t size, void** address) const {
return Allocate(false, size, address);
}
hsa_status_t MemoryRegion::Allocate(bool restrict_access, size_t size,
hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
void** address) const {
if (address == NULL) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -169,7 +165,13 @@ hsa_status_t MemoryRegion::Allocate(bool restrict_access, size_t size,
size = AlignUp(size, kPageSize_);
*address = AllocateKfdMemory(mem_flag_, owner()->node_id(), size);
HsaMemFlags kmt_alloc_flags(mem_flag_);
kmt_alloc_flags.ui32.ExecuteAccess =
(alloc_flags & AllocateExecutable ? 1 : 0);
kmt_alloc_flags.ui32.AQLQueueMemory =
(alloc_flags & AllocateDoubleMap ? 1 : 0);
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
if (*address != NULL) {
// Commit the memory.
@@ -184,7 +186,7 @@ hsa_status_t MemoryRegion::Allocate(bool restrict_access, size_t size,
const uint32_t* map_node_id = &owner_node_id;
if (IsSystem()) {
if (!restrict_access) {
if ((alloc_flags & AllocateRestrict) == 0) {
// Map to all GPU agents.
map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();
+2 -2
Zobrazit soubor
@@ -903,8 +903,8 @@ hsa_status_t
const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region);
IS_VALID(mem_region);
return core::Runtime::runtime_singleton_->AllocateMemory(mem_region, size,
ptr);
return core::Runtime::runtime_singleton_->AllocateMemory(
mem_region, size, core::MemoryRegion::AllocateNoFlags, ptr);
}
hsa_status_t hsa_memory_free(void* ptr) {
+2 -2
Zobrazit soubor
@@ -464,8 +464,8 @@ hsa_status_t
return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL;
}
return core::Runtime::runtime_singleton_->AllocateMemory(true, mem_region,
size, ptr);
return core::Runtime::runtime_singleton_->AllocateMemory(
mem_region, size, core::MemoryRegion::AllocateRestrict, ptr);
}
hsa_status_t hsa_amd_memory_pool_free(void* ptr) {
+17 -31
Zobrazit soubor
@@ -159,29 +159,22 @@ void Runtime::RegisterAgent(Agent* agent) {
// Init default fine grain system region allocator using fine grain
// system region of the first discovered CPU agent.
if (cpu_agents_.size() == 1) {
if (system_regions_fine_[0]->full_profile()) {
system_allocator_ = [](size_t size, size_t alignment) -> void * {
return _aligned_malloc(size, alignment);
};
// Might need memory pooling to cover allocation that
// requires less than 4096 bytes.
system_allocator_ =
[&](size_t size, size_t alignment,
MemoryRegion::AllocateFlags alloc_flags) -> void* {
assert(alignment <= 4096);
void* ptr = NULL;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(
system_regions_fine_[0], size, alloc_flags, &ptr))
? ptr
: NULL;
};
system_deallocator_ = [](void* ptr) { _aligned_free(ptr); };
} else {
// Might need memory pooling to cover allocation that
// requires less than 4096 bytes.
system_allocator_ = [&](size_t size, size_t alignment) -> void * {
assert(alignment <= 4096);
void* ptr = NULL;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(
system_regions_fine_[0], size, &ptr))
? ptr
: NULL;
};
system_deallocator_ = [](void* ptr) {
core::Runtime::runtime_singleton_->FreeMemory(ptr);
};
}
system_deallocator_ =
[](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
}
@@ -307,16 +300,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
}
hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
void** ptr) {
return AllocateMemory(false, region, size, ptr);
}
hsa_status_t Runtime::AllocateMemory(bool restrict_access,
const MemoryRegion* region, size_t size,
MemoryRegion::AllocateFlags alloc_flags,
void** address) {
const amd::MemoryRegion* amd_region =
reinterpret_cast<const amd::MemoryRegion*>(region);
hsa_status_t status = amd_region->Allocate(restrict_access, size, address);
hsa_status_t status = region->Allocate(size, alloc_flags, address);
// Track the allocation result so that it could be freed properly.
if (status == HSA_STATUS_SUCCESS) {