rocr: Generalize AMD::MemoryRegion Allocate and Free

Remove KFD-specific Allocate/Free calls from the AMD::MemoryRegion.
The KFD-driver-specific Allocate/Free calls are now implemented in
the KfdDriver. Future changes will migrate the remaining KFD-specific
calls out of AMD::MemoryRegion.

This allows the MemoryRegion to be used across AMD drivers like the
XDNA driver.

Change-Id: Ib6a2a9e5e1a15e61644d2592beb3a8e6578c3010
Этот коммит содержится в:
Tony Gutierrez
2024-08-19 15:46:36 +00:00
родитель c42ff44a6a
Коммит 68669f4e1a
11 изменённых файлов: 420 добавлений и 298 удалений
+188 -7
Просмотреть файл
@@ -49,6 +49,10 @@
#include "hsakmt/hsakmt.h"
#include "core/inc/amd_cpu_agent.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/exceptions.h"
#include "core/inc/runtime.h"
namespace rocr {
@@ -70,18 +74,155 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
return HSA_STATUS_SUCCESS;
}
hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id,
core::MemProperties &mprops) const {
hsa_status_t
KfdDriver::GetMemoryProperties(uint32_t node_id,
core::MemoryRegion &mem_region) const {
return HSA_STATUS_SUCCESS;
}
hsa_status_t KfdDriver::AllocateMemory(void **mem, size_t size,
uint32_t node_id, core::MemFlags flags) {
return HSA_STATUS_SUCCESS;
/// @brief Allocate memory for a KFD-backed memory region.
///
/// Translates @p alloc_flags into KFD thunk allocation flags (starting from
/// the region's own mem_flags()), then either sub-allocates from the region's
/// fragment allocator or allocates through AllocateKfdMemory and maps the
/// result to the relevant GPU node(s).
///
/// @param[in] mem_region Region to allocate from. It is static_cast to
/// AMD::MemoryRegion without a check, so the caller is assumed to pass an
/// AMD::MemoryRegion.
/// @param[in] alloc_flags core::MemoryRegion::Allocate* flags for the request.
/// @param[out] mem Receives the allocated address; set to nullptr on the
/// failure paths that free the allocation.
/// @param[in] size Size of the allocation, passed through unchanged.
/// @param[in] agent_node_id Node used for the allocation when
/// AllocateGTTAccess is set; otherwise the region owner's node is used.
///
/// @retval HSA_STATUS_SUCCESS if the memory was allocated (and mapped where
/// mapping is required).
/// @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if the allocation, a required
/// mapping, or the ASAN header page replacement failed.
hsa_status_t
KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
core::MemoryRegion::AllocateFlags alloc_flags,
void **mem, size_t size, uint32_t agent_node_id) {
// View the generic region as the AMD region to reach its KFD-specific state.
const MemoryRegion &m_region(static_cast<const MemoryRegion &>(mem_region));
// Start from the region's default flags and override per-request bits below.
HsaMemFlags kmt_alloc_flags(m_region.mem_flags());
kmt_alloc_flags.ui32.ExecuteAccess =
(alloc_flags & core::MemoryRegion::AllocateExecutable ? 1 : 0);
kmt_alloc_flags.ui32.AQLQueueMemory =
(alloc_flags & core::MemoryRegion::AllocateDoubleMap ? 1 : 0);
// NonPaged is only requested for system memory.
if (m_region.IsSystem() &&
(alloc_flags & core::MemoryRegion::AllocateNonPaged)) {
kmt_alloc_flags.ui32.NonPaged = 1;
}
// Allocating a memory handle for virtual memory
kmt_alloc_flags.ui32.NoAddress =
!!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);
// Allocate pseudo fine grain memory
kmt_alloc_flags.ui32.CoarseGrain =
(alloc_flags & core::MemoryRegion::AllocatePCIeRW
? 0
: kmt_alloc_flags.ui32.CoarseGrain);
kmt_alloc_flags.ui32.NoSubstitute =
(alloc_flags & core::MemoryRegion::AllocatePinned
? 1
: kmt_alloc_flags.ui32.NoSubstitute);
kmt_alloc_flags.ui32.GTTAccess =
(alloc_flags & core::MemoryRegion::AllocateGTTAccess
? 1
: kmt_alloc_flags.ui32.GTTAccess);
if (m_region.IsLocalMemory()) {
// Allocate physically contiguous memory. AllocateKfdMemory function call
// will fail if this flag is not supported in KFD.
kmt_alloc_flags.ui32.Contiguous =
(alloc_flags & core::MemoryRegion::AllocateContiguous
? 1
: kmt_alloc_flags.ui32.Contiguous);
}
// Only allow using the suballocator for ordinary VRAM.
if (m_region.IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
bool subAllocEnabled =
!core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
// Avoid modifying executable or queue allocations.
// The suballocator is used only when no flag other than AllocateRestrict
// is set.
bool useSubAlloc = subAllocEnabled;
useSubAlloc &=
((alloc_flags & (~core::MemoryRegion::AllocateRestrict)) == 0);
if (useSubAlloc) {
*mem = m_region.fragment_alloc(size);
// NOTE(review): useSubAlloc implies no flag besides AllocateRestrict is
// set, so this AllocateAsan branch looks unreachable unless the flag
// values overlap - confirm.
if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
m_region.fragment_free(*mem);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
return HSA_STATUS_SUCCESS;
}
}
// GTT-accessible allocations are made on the requesting agent's node; all
// others are made on the node of the region's owner.
const uint32_t node_id =
(alloc_flags & core::MemoryRegion::AllocateGTTAccess)
? agent_node_id
: m_region.owner()->node_id();
// Allocate memory.
// If it fails attempt to release memory from the block allocator and retry.
*mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
if (*mem == nullptr) {
m_region.owner()->Trim();
*mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
}
if (*mem != nullptr) {
// Handle-only allocations have no address to map.
if (kmt_alloc_flags.ui32.NoAddress)
return HSA_STATUS_SUCCESS;
// Commit the memory.
// For system memory, on non-restricted allocation, map it to all GPUs. On
// restricted allocation, only CPU is allowed to access by default, so
// no need to map
// For local memory, only map it to the owning GPU. Mapping to other GPU,
// if the access is allowed, is performed on AllowAccess.
HsaMemMapFlags map_flag = m_region.map_flags();
// Default mapping target: the single node that owns the region.
size_t map_node_count = 1;
const uint32_t owner_node_id = m_region.owner()->node_id();
const uint32_t *map_node_id = &owner_node_id;
if (m_region.IsSystem()) {
if ((alloc_flags & core::MemoryRegion::AllocateRestrict) == 0) {
// Map to all GPU agents.
map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();
if (map_node_count == 0) {
// No need to pin since no GPU in the platform.
return HSA_STATUS_SUCCESS;
}
map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
} else {
// No need to pin it for CPU exclusive access.
return HSA_STATUS_SUCCESS;
}
}
// alternate_va is an output of the mapping call; it is not used afterwards.
uint64_t alternate_va = 0;
const bool is_resident = MakeKfdMemoryResident(
map_node_count, map_node_id, *mem, size, &alternate_va, map_flag);
// A failed mapping is only fatal when the region is not full-profile, or is
// local or scratch memory.
const bool require_pinning =
(!m_region.full_profile() || m_region.IsLocalMemory() ||
m_region.IsScratch());
if (require_pinning && !is_resident) {
FreeKfdMemory(*mem, size);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
// For ASAN allocations, release the memory if the header page replacement
// fails.
if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
FreeKfdMemory(*mem, size);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
return HSA_STATUS_SUCCESS;
}
// Both allocation attempts failed.
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
hsa_status_t KfdDriver::FreeMemory(void *mem, uint32_t node_id) {
return HSA_STATUS_SUCCESS;
/// @brief Unmap @p mem from the GPU and release it through the KFD thunk.
///
/// @param[in] mem Address of the allocation to release.
/// @param[in] size Size of the allocation in bytes.
///
/// @retval HSA_STATUS_SUCCESS if FreeKfdMemory reported success.
/// @retval HSA_STATUS_ERROR otherwise.
hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
  // Unmap first; the result of the unmap is not checked.
  MakeKfdMemoryUnresident(mem);

  if (!FreeKfdMemory(mem, size)) {
    return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}
hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
@@ -92,5 +233,45 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
return HSA_STATUS_SUCCESS;
}
void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
size_t size) {
void *mem = nullptr;
const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flags, &mem);
return (status == HSAKMT_STATUS_SUCCESS) ? mem : nullptr;
}
/// @brief Release memory through hsaKmtFreeMemory.
///
/// @param[in] mem Address of the allocation to release.
/// @param[in] size Size of the allocation in bytes.
///
/// @return true if the memory was released, or if there was nothing to
/// release (@p mem is nullptr or @p size is 0); false if hsaKmtFreeMemory
/// failed.
bool KfdDriver::FreeKfdMemory(void *mem, size_t size) {
  if (mem == nullptr || size == 0) {
    // A null pointer or empty range is logged but deliberately not treated as
    // an error.
    debug_print("Invalid free ptr:%p size:%zu\n", mem, size);
    return true;
  }

  if (hsaKmtFreeMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
    // %zu is the portable conversion specifier for size_t.
    debug_print("Failed to free ptr:%p size:%zu\n", mem, size);
    return false;
  }

  return true;
}
bool KfdDriver::MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
const void *mem, size_t size,
uint64_t *alternate_va,
HsaMemMapFlags map_flag) {
assert(num_node > 0);
assert(nodes);
*alternate_va = 0;
HSAKMT_STATUS kmt_status(hsaKmtMapMemoryToGPUNodes(
const_cast<void *>(mem), size, alternate_va, map_flag, num_node,
const_cast<uint32_t *>(nodes)));
return (kmt_status == HSAKMT_STATUS_SUCCESS);
}
/// @brief Unmap @p mem from the GPU via hsaKmtUnmapMemoryToGPU.
///
/// The status returned by the thunk call is ignored.
void KfdDriver::MakeKfdMemoryUnresident(const void *mem) {
  // The thunk entry point takes a non-const pointer.
  void *const address = const_cast<void *>(mem);
  hsaKmtUnmapMemoryToGPU(address);
}
} // namespace AMD
} // namespace rocr
+7 -5
Просмотреть файл
@@ -47,6 +47,7 @@
#include <memory>
#include <string>
#include "core/inc/amd_memory_region.h"
#include "core/inc/runtime.h"
#include "uapi/amdxdna_accel.h"
@@ -89,17 +90,18 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) {
hsa_status_t
XdnaDriver::GetMemoryProperties(uint32_t node_id,
core::MemProperties &mprops) const {
core::MemoryRegion &mem_region) const {
return HSA_STATUS_SUCCESS;
}
hsa_status_t XdnaDriver::AllocateMemory(void **mem, size_t size,
uint32_t node_id,
core::MemFlags flags) {
hsa_status_t
XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
core::MemoryRegion::AllocateFlags alloc_flags,
void **mem, size_t size, uint32_t node_id) {
return HSA_STATUS_SUCCESS;
}
hsa_status_t XdnaDriver::FreeMemory(void *mem, uint32_t node_id) {
hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
return HSA_STATUS_SUCCESS;
}
+12 -10
Просмотреть файл
@@ -49,11 +49,12 @@
#include <vector>
#include "core/inc/checked.h"
#include "core/inc/driver.h"
#include "core/inc/isa.h"
#include "core/inc/queue.h"
#include "core/inc/memory_region.h"
#include "core/util/utils.h"
#include "core/inc/queue.h"
#include "core/util/locks.h"
#include "core/util/utils.h"
namespace rocr {
@@ -117,19 +118,18 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
// @brief Agent class constructor.
//
// @param [in] type CPU or GPU or other.
explicit Agent(uint32_t node_id, DeviceType type)
: node_id_(node_id),
device_type_(uint32_t(type)),
profiling_enabled_(false),
enabled_(false) {
// @param [in] drv_type Driver type recorded in driver_type for this agent.
// @param [in] node_id Node ID stored in node_id_.
// @param [in] type Device type, stored in device_type_ as a uint32_t.
//
// Profiling and the enabled flag both start out false. The public handle is
// derived from this object via Convert().
explicit Agent(DriverType drv_type, uint32_t node_id, DeviceType type)
: driver_type(drv_type), node_id_(node_id), device_type_(uint32_t(type)),
profiling_enabled_(false), enabled_(false) {
public_handle_ = Convert(this);
}
// @brief Agent class constructor.
//
// @param [in] type CPU or GPU or other.
explicit Agent(uint32_t node_id, uint32_t type)
: node_id_(node_id), device_type_(type), profiling_enabled_(false) {
// @param [in] drv_type Driver type recorded in driver_type for this agent.
// @param [in] node_id Node ID stored in node_id_.
// @param [in] type Device type as a raw uint32_t, stored in device_type_.
//
// enabled_ is initialized to false here as well, matching the DeviceType
// overload, so the flag never starts out indeterminate.
explicit Agent(DriverType drv_type, uint32_t node_id, uint32_t type)
    : driver_type(drv_type), node_id_(node_id), device_type_(type),
      profiling_enabled_(false), enabled_(false) {
  public_handle_ = Convert(this);
}
@@ -315,7 +315,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
for (auto region : regions()) region->Trim();
}
protected:
const DriverType driver_type;
protected:
// Intention here is to have a polymorphic update procedure for public_handle_
// which is callable on any Agent* but only from some class derived from
// Agent*. do_set_public_handle should remain protected or private in all
+125 -112
Просмотреть файл
@@ -51,15 +51,16 @@
#include "hsakmt/hsakmt.h"
#include "core/inc/runtime.h"
#include "core/inc/agent.h"
#include "core/inc/blit.h"
#include "core/inc/signal.h"
#include "core/inc/cache.h"
#include "core/inc/driver.h"
#include "core/inc/runtime.h"
#include "core/inc/scratch_cache.h"
#include "core/util/small_heap.h"
#include "core/util/locks.h"
#include "core/inc/signal.h"
#include "core/util/lazy_ptr.h"
#include "core/util/locks.h"
#include "core/util/small_heap.h"
#include "pcs/pcs_runtime.h"
namespace rocr {
@@ -72,142 +73,154 @@ typedef ScratchCache::ScratchInfo ScratchInfo;
class GpuAgentInt : public core::Agent {
public:
// @brief Constructor
GpuAgentInt(uint32_t node_id)
: core::Agent(node_id,core::Agent::DeviceType::kAmdGpuDevice) {}
GpuAgentInt(uint32_t node_id)
: core::Agent(core::DriverType::KFD, node_id,
core::Agent::DeviceType::kAmdGpuDevice) {}
// @brief Ensure blits are ready (performance hint).
virtual void PreloadBlits() {}
// @brief Ensure blits are ready (performance hint).
virtual void PreloadBlits() {}
// @brief Initialization hook invoked after tools library has loaded,
// to allow tools interception of interface functions.
//
// @retval HSA_STATUS_SUCCESS if initialization is successful.
virtual hsa_status_t PostToolsInit() = 0;
// @brief Initialization hook invoked after tools library has loaded,
// to allow tools interception of interface functions.
//
// @retval HSA_STATUS_SUCCESS if initialization is successful.
virtual hsa_status_t PostToolsInit() = 0;
// @brief Invoke the user provided callback for each region accessible by
// this agent.
//
// @param [in] include_peer If true, the callback will be also invoked on each
// peer memory region accessible by this agent. If false, only invoke the
// callback on memory region owned by this agent.
// @param [in] callback User provided callback function.
// @param [in] data User provided pointer as input for @p callback.
//
// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
// region returns ::HSA_STATUS_SUCCESS.
virtual hsa_status_t VisitRegion(bool include_peer,
hsa_status_t (*callback)(hsa_region_t region,
void* data),
void* data) const = 0;
// @brief Invoke the user provided callback for each region accessible by
// this agent.
//
// @param [in] include_peer If true, the callback will be also invoked on
// each peer memory region accessible by this agent. If false, only invoke
// the callback on memory region owned by this agent.
// @param [in] callback User provided callback function.
// @param [in] data User provided pointer as input for @p callback.
//
// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
// region returns ::HSA_STATUS_SUCCESS.
virtual hsa_status_t
VisitRegion(bool include_peer,
hsa_status_t (*callback)(hsa_region_t region, void *data),
void *data) const = 0;
// @brief Carve scratch memory for main from scratch pool.
//
// @param [in/out] scratch Structure to be populated with the carved memory
// information.
virtual void AcquireQueueMainScratch(ScratchInfo& scratch) = 0;
// @brief Carve scratch memory for main from scratch pool.
//
// @param [in/out] scratch Structure to be populated with the carved memory
// information.
virtual void AcquireQueueMainScratch(ScratchInfo &scratch) = 0;
// @brief Carve scratch memory for alt from scratch pool.
//
// @param [in/out] scratch Structure to be populated with the carved memory
// information.
virtual void AcquireQueueAltScratch(ScratchInfo& scratch) = 0;
// @brief Carve scratch memory for alt from scratch pool.
//
// @param [in/out] scratch Structure to be populated with the carved memory
// information.
virtual void AcquireQueueAltScratch(ScratchInfo &scratch) = 0;
// @brief Release scratch memory from main back to scratch pool.
//
// @param [in/out] scratch Scratch memory previously acquired with call to
// ::AcquireQueueMainScratch.
virtual void ReleaseQueueMainScratch(ScratchInfo& base) = 0;
// @brief Release scratch memory from main back to scratch pool.
//
// @param [in/out] scratch Scratch memory previously acquired with call to
// ::AcquireQueueMainScratch.
virtual void ReleaseQueueMainScratch(ScratchInfo &base) = 0;
// @brief Release scratch memory back from alternate to scratch pool.
//
// @param [in/out] scratch Scratch memory previously acquired with call to
// ::AcquireQueueAltcratch.
virtual void ReleaseQueueAltScratch(ScratchInfo& base) = 0;
// @brief Release scratch memory back from alternate to scratch pool.
//
// @param [in/out] scratch Scratch memory previously acquired with call to
// ::AcquireQueueAltScratch.
virtual void ReleaseQueueAltScratch(ScratchInfo &base) = 0;
// @brief Translate the kernel start and end dispatch timestamp from agent
// domain to host domain.
//
// @param [in] signal Pointer to signal that provides the dispatch timing.
// @param [out] time Structure to be populated with the host domain value.
virtual void TranslateTime(core::Signal* signal,
hsa_amd_profiling_dispatch_time_t& time) = 0;
// @brief Translate the kernel start and end dispatch timestamp from agent
// domain to host domain.
//
// @param [in] signal Pointer to signal that provides the dispatch timing.
// @param [out] time Structure to be populated with the host domain value.
virtual void TranslateTime(core::Signal *signal,
hsa_amd_profiling_dispatch_time_t &time) = 0;
// @brief Translate the async copy start and end timestamp from agent
// domain to host domain.
//
// @param [in] signal Pointer to signal that provides the async copy timing.
// @param [out] time Structure to be populated with the host domain value.
virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) = 0;
// @brief Translate the async copy start and end timestamp from agent
// domain to host domain.
//
// @param [in] signal Pointer to signal that provides the async copy timing.
// @param [out] time Structure to be populated with the host domain value.
virtual void TranslateTime(core::Signal *signal,
hsa_amd_profiling_async_copy_time_t &time) = 0;
// @brief Translate timestamp agent domain to host domain.
//
// @param [out] time Timestamp in agent domain.
virtual uint64_t TranslateTime(uint64_t tick) = 0;
// @brief Translate a timestamp from agent domain to host domain.
//
// @param [in] tick Timestamp in agent domain.
virtual uint64_t TranslateTime(uint64_t tick) = 0;
// @brief Invalidate caches on the agent which may hold code object data.
virtual void InvalidateCodeCaches() = 0;
// @brief Invalidate caches on the agent which may hold code object data.
virtual void InvalidateCodeCaches() = 0;
// @brief Sets the coherency type of this agent.
//
// @param [in] type New coherency type.
//
// @retval true The new coherency type is set successfuly.
virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0;
// @brief Sets the coherency type of this agent.
//
// @param [in] type New coherency type.
//
// @retval true The new coherency type is set successfully.
virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0;
// @brief Returns the current coherency type of this agent.
//
// @retval Coherency type.
virtual hsa_amd_coherency_type_t current_coherency_type() const = 0;
// @brief Returns the current coherency type of this agent.
//
// @retval Coherency type.
virtual hsa_amd_coherency_type_t current_coherency_type() const = 0;
virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0;
virtual void RegisterGangPeer(core::Agent &gang_peer,
unsigned int bandwidth_factor) = 0;
virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0;
virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent &gang_peer,
uint32_t rec_sdma_eng_id_mask) = 0;
// @brief Query if agent represent Kaveri GPU.
//
// @retval true if agent is Kaveri GPU.
virtual bool is_kv_device() const = 0;
// @brief Query if agent represent Kaveri GPU.
//
// @retval true if agent is Kaveri GPU.
virtual bool is_kv_device() const = 0;
// @brief Query the agent HSA profile.
//
// @retval HSA profile.
virtual hsa_profile_t profile() const = 0;
// @brief Query the agent HSA profile.
//
// @retval HSA profile.
virtual hsa_profile_t profile() const = 0;
// @brief Query the agent memory bus width in bit.
//
// @retval Bus width in bit.
virtual uint32_t memory_bus_width() const = 0;
// @brief Query the agent memory bus width in bit.
//
// @retval Bus width in bit.
virtual uint32_t memory_bus_width() const = 0;
// @brief Query the agent memory maximum frequency in MHz.
//
// @retval Bus width in MHz.
virtual uint32_t memory_max_frequency() const = 0;
// @brief Query the agent memory maximum frequency in MHz.
//
// @retval Maximum memory frequency in MHz.
virtual uint32_t memory_max_frequency() const = 0;
// @brief Whether agent supports asynchronous scratch reclaim. Depends on CP FW
virtual bool AsyncScratchReclaimEnabled() const = 0;
// @brief Whether agent supports asynchronous scratch reclaim. Depends on CP
// FW
virtual bool AsyncScratchReclaimEnabled() const = 0;
// @brief Update the agent's scratch use-once threshold.
// Only valid when async scratch reclaim is supported
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0;
// @brief Update the agent's scratch use-once threshold.
// Only valid when async scratch reclaim is supported
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0;
// @brief Iterate through supported PC Sampling configurations
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
void* cb_data) = 0;
// @brief Iterate through supported PC Sampling configurations
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t
PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
void *cb_data) = 0;
virtual hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) = 0;
virtual hsa_status_t
PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) = 0;
virtual hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
pcs::PcsRuntime::PcSamplingSession& session) = 0;
virtual hsa_status_t
PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
pcs::PcsRuntime::PcSamplingSession &session) = 0;
virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0;
virtual hsa_status_t
PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) = 0;
virtual hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) = 0;
virtual hsa_status_t
PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) = 0;
virtual hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) = 0;
virtual hsa_status_t
PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) = 0;
virtual hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) = 0;
virtual hsa_status_t
PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) = 0;
};
class GpuAgent : public GpuAgentInt {
+37 -7
Просмотреть файл
@@ -43,11 +43,21 @@
#ifndef HSA_RUNTIME_CORE_INC_AMD_KFD_DRIVER_H_
#define HSA_RUNTIME_CORE_INC_AMD_KFD_DRIVER_H_
#include "core/inc/driver.h"
#include <string>
#include "hsakmt/hsakmt.h"
#include "core/inc/driver.h"
#include "core/inc/memory_region.h"
namespace rocr {
namespace core {
class Queue;
}
namespace AMD {
class KfdDriver : public core::Driver {
@@ -57,13 +67,33 @@ public:
static hsa_status_t DiscoverDriver();
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
hsa_status_t GetMemoryProperties(uint32_t node_id,
core::MemProperties &mprops) const override;
hsa_status_t AllocateMemory(void **mem, size_t size, uint32_t node_id,
core::MemFlags flags) override;
hsa_status_t FreeMemory(void *mem, uint32_t node_id) override;
hsa_status_t
GetMemoryProperties(uint32_t node_id,
core::MemoryRegion &mem_region) const override;
hsa_status_t AllocateMemory(const core::MemoryRegion &mem_region,
core::MemoryRegion::AllocateFlags alloc_flags,
void **mem, size_t size,
uint32_t node_id) override;
hsa_status_t FreeMemory(void *mem, size_t size) override;
hsa_status_t CreateQueue(core::Queue &queue) override;
hsa_status_t DestroyQueue(core::Queue &queue) const override;
private:
/// @brief Allocate agent accessible memory (system / local memory).
static void *AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
size_t size);
/// @brief Free agent accessible memory (system / local memory).
static bool FreeKfdMemory(void *mem, size_t size);
/// @brief Pin memory.
static bool MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
const void *mem, size_t size,
uint64_t *alternate_va,
HsaMemMapFlags map_flag);
/// @brief Unpin memory.
static void MakeKfdMemoryUnresident(const void *mem);
};
} // namespace AMD
+9 -8
Просмотреть файл
@@ -77,13 +77,6 @@ class MemoryRegion : public core::MemoryRegion {
return reinterpret_cast<MemoryRegion*>(region.handle);
}
/// @brief Allocate agent accessible memory (system / local memory).
static void* AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id,
size_t size);
/// @brief Free agent accessible memory (system / local memory).
static bool FreeKfdMemory(void* ptr, size_t size);
static bool RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags);
static void DeregisterMemory(void* ptr);
@@ -175,7 +168,15 @@ class MemoryRegion : public core::MemoryRegion {
__forceinline size_t GetPageSize() const { return kPageSize_; }
private:
/// @brief Returns the allocation flags stored in mem_flag_ for this region.
__forceinline const HsaMemFlags &mem_flags() const { return mem_flag_; }
/// @brief Returns the mapping flags stored in map_flag_ for this region.
__forceinline const HsaMemMapFlags &map_flags() const { return map_flag_; }
/// @brief Allocate @p size bytes from this region's fragment allocator.
/// @return Whatever fragment_allocator_.alloc(size) returns.
void *fragment_alloc(size_t size) const {
return fragment_allocator_.alloc(size);
}
/// @brief Hand @p mem back to this region's fragment allocator.
/// @return The result of fragment_allocator_.free(mem); presumably true when
/// the allocator owned and released @p mem - confirm against the allocator.
bool fragment_free(void *mem) const { return fragment_allocator_.free(mem); }
private:
const HsaMemoryProperties mem_props_;
HsaMemFlags mem_flag_;
+13 -5
Просмотреть файл
@@ -45,8 +45,13 @@
#include <memory>
#include "core/inc/driver.h"
#include "core/inc/memory_region.h"
namespace rocr {
namespace core {
class Queue;
}
namespace AMD {
class XdnaDriver : public core::Driver {
@@ -57,11 +62,14 @@ public:
static hsa_status_t DiscoverDriver();
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
hsa_status_t GetMemoryProperties(uint32_t node_id,
core::MemProperties &mprops) const override;
hsa_status_t AllocateMemory(void **mem, size_t size, uint32_t node_id,
core::MemFlags flags) override;
hsa_status_t FreeMemory(void *mem, uint32_t node_id) override;
hsa_status_t
GetMemoryProperties(uint32_t node_id,
core::MemoryRegion &mem_region) const override;
hsa_status_t AllocateMemory(const core::MemoryRegion &mem_region,
core::MemoryRegion::AllocateFlags alloc_flags,
void **mem, size_t size,
uint32_t node_id) override;
hsa_status_t FreeMemory(void *mem, size_t size) override;
hsa_status_t CreateQueue(core::Queue &queue) override;
hsa_status_t DestroyQueue(core::Queue &queue) const override;
+18 -13
Просмотреть файл
@@ -46,20 +46,13 @@
#include <limits>
#include <string>
#include "core/inc/agent.h"
#include "core/inc/memory_region.h"
#include "inc/hsa.h"
namespace rocr {
namespace core {
using MemFlags = uint32_t;
struct MemProperties {
MemFlags flags_;
size_t size_bytes_;
uint64_t virtual_base_addr_;
};
class Queue;
struct DriverVersionInfo {
uint32_t major;
@@ -85,17 +78,27 @@ class Driver {
/// @retval HSA_STATUS_SUCCESS if the kernel-model driver query was
/// successful.
virtual hsa_status_t QueryKernelModeDriver(DriverQuery query) = 0;
/// @brief Open a connection to the driver using name_.
/// @retval HSA_STATUS_SUCCESS if the driver was opened successfully.
hsa_status_t Open();
/// @brief Close a connection to the open driver using fd_.
/// @retval HSA_STATUS_SUCCESS if the driver was closed successfully.
hsa_status_t Close();
/// @brief Get driver version information.
/// @retval DriverVersionInfo containing the driver's version information.
DriverVersionInfo Version() const { return version_; }
const DriverVersionInfo &Version() const { return version_; }
virtual hsa_status_t GetMemoryProperties(uint32_t node_id, MemProperties &mprops) const = 0;
/// @brief Get the memory properties of a specific node.
/// @param node_id Node ID of the agent
/// @param[in, out] mem_region MemoryRegion object whose properties will be
/// retrieved.
/// @retval HSA_STATUS_SUCCESS if the driver successfully returns the node's
/// memory properties.
virtual hsa_status_t GetMemoryProperties(uint32_t node_id,
MemoryRegion &mem_region) const = 0;
/// @brief Allocate agent-accessible memory (system or agent-local memory).
///
@@ -103,10 +106,12 @@ class Driver {
///
/// @retval HSA_STATUS_SUCCESS if memory was successfully allocated or
/// hsa_status_t error code if the memory allocation failed.
virtual hsa_status_t AllocateMemory(void** mem, size_t size, uint32_t node_id,
MemFlags flags) = 0;
virtual hsa_status_t AllocateMemory(const MemoryRegion &mem_region,
MemoryRegion::AllocateFlags alloc_flags,
void **mem, size_t size,
uint32_t node_id) = 0;
virtual hsa_status_t FreeMemory(void* mem, uint32_t node_id) = 0;
virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0;
virtual hsa_status_t CreateQueue(Queue &queue) = 0;
+3 -1
Просмотреть файл
@@ -43,12 +43,14 @@
#include "core/inc/amd_aie_agent.h"
#include "core/inc/amd_aie_aql_queue.h"
#include "core/inc/driver.h"
namespace rocr {
namespace AMD {
AieAgent::AieAgent(uint32_t node)
: core::Agent(node, core::Agent::DeviceType::kAmdAieDevice),
: core::Agent(core::DriverType::XDNA, node,
core::Agent::DeviceType::kAmdAieDevice),
max_queues_(core::Runtime::runtime_singleton_->flag().max_queues()) {
InitRegionList();
}
+4 -2
Просмотреть файл
@@ -47,14 +47,16 @@
#include <thread>
#include "core/inc/amd_memory_region.h"
#include "core/inc/driver.h"
#include "core/inc/host_queue.h"
#include "inc/hsa_ext_image.h"
namespace rocr {
namespace AMD {
CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
: core::Agent(node, kAmdCpuDevice), properties_(node_props) {
CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties &node_props)
: core::Agent(core::DriverType::KFD, node, kAmdCpuDevice),
properties_(node_props) {
InitRegionList();
InitCacheList();
+4 -128
Просмотреть файл
@@ -59,25 +59,6 @@ namespace AMD {
size_t MemoryRegion::max_sysmem_alloc_size_ = 0;
size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE);
void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) {
void* ret = NULL;
const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
}
bool MemoryRegion::FreeKfdMemory(void* ptr, size_t size) {
if (ptr == NULL || size == 0) {
debug_print("Invalid free ptr:%p size:%lu\n", ptr, size);
return true;
}
if (hsaKmtFreeMemory(ptr, size) != HSAKMT_STATUS_SUCCESS) {
debug_print("Failed to free ptr:%p size:%lu\n", ptr, size);
return false;
}
return true;
}
bool MemoryRegion::RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags) {
assert(ptr != NULL);
assert(size != 0);
@@ -198,112 +179,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
size = AlignUp(size, kPageSize_);
HsaMemFlags kmt_alloc_flags(mem_flag_);
kmt_alloc_flags.ui32.ExecuteAccess =
(alloc_flags & AllocateExecutable ? 1 : 0);
kmt_alloc_flags.ui32.AQLQueueMemory =
(alloc_flags & AllocateDoubleMap ? 1 : 0);
if (IsSystem() && (alloc_flags & AllocateNonPaged))
kmt_alloc_flags.ui32.NonPaged = 1;
// Allocating a memory handle for virtual memory
kmt_alloc_flags.ui32.NoAddress = !!(alloc_flags & AllocateMemoryOnly);
// Allocate pseudo fine grain memory
kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain);
kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute);
kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess);
if (IsLocalMemory()) {
// Allocate physically contiguous memory - AllocateKfdMemory function call will fail
// if this flag is not supported in KFD.
kmt_alloc_flags.ui32.Contiguous =
(alloc_flags & AllocateContiguous ? 1 : kmt_alloc_flags.ui32.Contiguous);
}
// Only allow using the suballocator for ordinary VRAM.
if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
// Avoid modifying executable or queue allocations.
bool useSubAlloc = subAllocEnabled;
useSubAlloc &= ((alloc_flags & (~AllocateRestrict)) == 0);
if (useSubAlloc) {
*address = fragment_allocator_.alloc(size);
if ((alloc_flags & AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) {
fragment_allocator_.free(*address);
*address = NULL;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
return HSA_STATUS_SUCCESS;
}
}
const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id();
// Allocate memory.
// If it fails attempt to release memory from the block allocator and retry.
*address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
if (*address == nullptr) {
owner()->Trim();
*address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
}
if (*address != nullptr) {
if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS;
// Commit the memory.
// For system memory, on non-restricted allocation, map it to all GPUs. On
// restricted allocation, only CPU is allowed to access by default, so
// no need to map
// For local memory, only map it to the owning GPU. Mapping to other GPU,
// if the access is allowed, is performed on AllowAccess.
HsaMemMapFlags map_flag = map_flag_;
size_t map_node_count = 1;
const uint32_t owner_node_id = owner()->node_id();
const uint32_t* map_node_id = &owner_node_id;
if (IsSystem()) {
if ((alloc_flags & AllocateRestrict) == 0) {
// Map to all GPU agents.
map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();
if (map_node_count == 0) {
// No need to pin since no GPU in the platform.
return HSA_STATUS_SUCCESS;
}
map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
} else {
// No need to pin it for CPU exclusive access.
return HSA_STATUS_SUCCESS;
}
}
uint64_t alternate_va = 0;
const bool is_resident = MakeKfdMemoryResident(
map_node_count, map_node_id, *address, size, &alternate_va, map_flag);
const bool require_pinning =
(!full_profile() || IsLocalMemory() || IsScratch());
if (require_pinning && !is_resident) {
FreeKfdMemory(*address, size);
*address = NULL;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
if ((alloc_flags & AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) {
FreeKfdMemory(*address, size);
*address = NULL;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
return HSA_STATUS_SUCCESS;
}
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
return core::Runtime::runtime_singleton_->AgentDriver(owner()->driver_type)
.AllocateMemory(*this, alloc_flags, address, size, agent_node_id);
}
hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
@@ -314,9 +191,8 @@ hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {
if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS;
MakeKfdMemoryUnresident(address);
return FreeKfdMemory(address, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
return core::Runtime::runtime_singleton_->AgentDriver(owner()->driver_type)
.FreeMemory(address, size);
}
// TODO: Look into a better name and/or making this process transparent to exporting.