340f3aa887
* clr: Implement dynamic stream to HW queue assignment This change implements dynamic stream to hardware queue (HWq) mapping with the following features: * Queue depth heuristics with weights for optimal HWq assignment * Make last used queue sticky for better locality * Use pipe HWq to pipe mapping - gfx9 follows a round-robin queue to pipe mapping based on creation order (single process per device only, as pipe ID is statically assigned by runtime) * More aggressive heuristic usage for better queue distribution * Extend dynamic queues support for all stream priorities Environment variables: * DEBUG_HIP_DYNAMIC_QUEUE: 0 - disabled, 1 - Depth heuristics 2 - Depth+Pipe heuristics * DEBUG_HIP_IGNORE_STREAM_PRIORITY=1: ignore priority stream creation * clr: Clean up last_used_queue_
791 строка
30 KiB
C++
791 строка
30 KiB
C++
/* Copyright (c) 2009 - 2025 Advanced Micro Devices, Inc.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE. */
|
|
|
|
#pragma once
|
|
|
|
#include "top.hpp"
|
|
#include "CL/cl.h"
|
|
#include "device/device.hpp"
|
|
#include "platform/command.hpp"
|
|
#include "platform/program.hpp"
|
|
#include "platform/perfctr.hpp"
|
|
#include "platform/memory.hpp"
|
|
#include "utils/concurrent.hpp"
|
|
#include "thread/thread.hpp"
|
|
#include "thread/monitor.hpp"
|
|
#include "utils/versions.hpp"
|
|
|
|
#include "device/rocm/rocrctx.hpp"
|
|
#include "device/rocm/rocsettings.hpp"
|
|
#include "device/rocm/rocvirtual.hpp"
|
|
#include "device/rocm/rocdefs.hpp"
|
|
#include "device/rocm/rocprintf.hpp"
|
|
#include "device/rocm/rocglinterop.hpp"
|
|
|
|
#include <atomic>
|
|
#include <iostream>
|
|
#include <vector>
|
|
#include <memory>
|
|
|
|
/*! \addtogroup HSA
|
|
* @{
|
|
*/
|
|
|
|
//! HSA Device Implementation
|
|
namespace amd::roc {
|
|
|
|
/**
 * @brief Names of the environment variables that can be used to configure
 * the behavior of the HSA runtime.
 */
#define ENVVAR_HSA_POLL_KERNEL_COMPLETION "HSA_POLL_COMPLETION"
|
|
|
|
//! Forward declarations (the types are only named here; they are defined elsewhere
//! or further down in this header)
class Command;
class Device;
class GpuCommand;
class Heap;
class HeapBlock;
class Program;
class Kernel;
class Memory;
class Resource;
class VirtualDevice;
class PrintfDbg;
|
|
|
|
class ProfilingSignal : public amd::ReferenceCountedObject {
|
|
public:
|
|
hsa_signal_t signal_; //!< HSA signal to track profiling information
|
|
Timestamp* ts_; //!< Timestamp object associated with the signal
|
|
HwQueueEngine engine_; //!< Engine used with this signal
|
|
amd::Monitor lock_; //!< Signal lock for update
|
|
|
|
typedef union {
|
|
struct {
|
|
uint32_t done_ : 1; //!< True if signal is done
|
|
uint32_t isPacketDispatch_ : 1; //!< True if the packet, used with the signal, is dispatch
|
|
uint32_t interrupt_ : 1; //!< True if the signal will trigger an interrupt
|
|
uint32_t reserved_ : 29;
|
|
};
|
|
uint32_t data_;
|
|
} Flags;
|
|
|
|
Flags flags_;
|
|
|
|
//! Cached timing data - populated when signal completes, avoids repeated HSA calls
|
|
struct CachedTiming {
|
|
uint64_t start_ = 0; //!< Cached start timestamp from HSA
|
|
uint64_t end_ = 0; //!< Cached end timestamp from HSA
|
|
bool valid_ = false; //!< True if timing data has been cached
|
|
};
|
|
CachedTiming cached_timing_;
|
|
|
|
ProfilingSignal()
|
|
: ts_(nullptr),
|
|
engine_(HwQueueEngine::Compute),
|
|
lock_(true) /* Signal Ops Lock */
|
|
{
|
|
signal_.handle = 0;
|
|
flags_.data_ = 0;
|
|
flags_.done_ = true;
|
|
}
|
|
|
|
virtual ~ProfilingSignal();
|
|
amd::Monitor& LockSignalOps() { return lock_; }
|
|
|
|
//! Cache timing data from HSA for this signal (called once when signal completes)
|
|
void CacheTimingData(hsa_agent_t gpu_device);
|
|
|
|
//! Reset cached timing for signal reuse
|
|
void ResetCachedTiming() {
|
|
amd::ScopedLock lock(lock_);
|
|
cached_timing_.start_ = 0;
|
|
cached_timing_.end_ = 0;
|
|
cached_timing_.valid_ = false;
|
|
}
|
|
|
|
//! Check if timing is already cached
|
|
bool IsTimingCached() const { return cached_timing_.valid_; }
|
|
|
|
//! Get cached timing values
|
|
void GetCachedTiming(uint64_t& start, uint64_t& end) {
|
|
amd::ScopedLock lock(lock_);
|
|
start = cached_timing_.start_;
|
|
end = cached_timing_.end_;
|
|
}
|
|
};
|
|
|
|
class Sampler : public device::Sampler {
|
|
public:
|
|
//! Constructor
|
|
Sampler(const Device& dev) : dev_(dev) {}
|
|
|
|
//! Default destructor for the device memory object
|
|
virtual ~Sampler();
|
|
|
|
//! Creates a device sampler from the OCL sampler state
|
|
bool create(const amd::Sampler& owner //!< AMD sampler object
|
|
);
|
|
|
|
private:
|
|
void fillSampleDescriptor(hsa_ext_sampler_descriptor_v2_t& samplerDescriptor,
|
|
const amd::Sampler& sampler) const;
|
|
Sampler& operator=(const Sampler&);
|
|
|
|
//! Disable operator=
|
|
Sampler(const Sampler&);
|
|
|
|
const Device& dev_; //!< Device object associated with the sampler
|
|
|
|
hsa_ext_sampler_t hsa_sampler;
|
|
};
|
|
|
|
// A NULL Device type used only for offline compilation
|
|
// Only functions that are used for compilation will be in this device
|
|
class NullDevice : public amd::Device {
|
|
public:
|
|
//! constructor
|
|
NullDevice(){};
|
|
|
|
//! create the device
|
|
bool create(const amd::Isa& isa);
|
|
|
|
//! Initialise all the offline devices that can be used for compilation
|
|
static bool init();
|
|
//! Teardown for offline devices
|
|
static void tearDown();
|
|
|
|
//! Destructor for the Null device
|
|
virtual ~NullDevice();
|
|
|
|
const Settings& settings() const { return static_cast<Settings&>(*settings_); }
|
|
|
|
//! Construct an device program object from the ELF assuming it is valid
|
|
device::Program* createProgram(amd::Program& owner,
|
|
amd::option::Options* options = nullptr) override;
|
|
|
|
// List of dummy functions which are disabled for NullDevice
|
|
|
|
//! Create a new virtual device environment.
|
|
device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = nullptr) override {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
|
|
virtual bool registerSvmMemory(void* ptr, size_t size) const {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
virtual void deregisterSvmMemory(void* ptr) const { ShouldNotReachHere(); }
|
|
|
|
//! Just returns nullptr for the dummy device
|
|
device::Memory* createMemory(amd::Memory& owner) const override {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
device::Memory* createMemory(size_t size, size_t alignment = 0) const override {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
//! Sampler object allocation
|
|
bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
|
|
device::Sampler** sampler //!< device sampler object
|
|
) const override {
|
|
ShouldNotReachHere();
|
|
return true;
|
|
}
|
|
|
|
//! Just returns nullptr for the dummy device
|
|
device::Memory* createView(
|
|
amd::Memory& owner, //!< Owner memory object
|
|
const device::Memory& parent //!< Parent device memory object for the view
|
|
) const override {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
|
|
device::Signal* createSignal() const override {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
|
|
//! Just returns nullptr for the dummy device
|
|
void* svmAlloc(amd::Context& context, //!< The context used to create a buffer
|
|
size_t size, //!< size of svm spaces
|
|
size_t alignment, //!< alignment requirement of svm spaces
|
|
cl_svm_mem_flags flags, //!< flags of creation svm spaces
|
|
void* svmPtr //!< existing svm pointer for mGPU case
|
|
) const override {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
|
|
//! Just returns nullptr for the dummy device
|
|
void svmFree(void* ptr //!< svm pointer needed to be freed
|
|
) const override {
|
|
ShouldNotReachHere();
|
|
return;
|
|
}
|
|
|
|
void* virtualAlloc(void* req_addr, size_t size, size_t alignment) override {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
|
|
bool virtualFree(void* addr) override {
|
|
ShouldNotReachHere();
|
|
return true;
|
|
}
|
|
|
|
virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags,
|
|
VmmLocationType = VmmLocationType::kDevice) override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
virtual bool ValidateMemAccess(amd::Memory& mem, bool read_write) const override {
|
|
ShouldNotReachHere();
|
|
return true;
|
|
}
|
|
|
|
//! Determine if we can use device memory for SVM
|
|
const bool forceFineGrain(amd::Memory* memory) const {
|
|
return (memory->getContext().devices().size() > 1);
|
|
}
|
|
|
|
virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle,
|
|
amd::ExternalSemaphoreHandleType sem_handle_type) override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
void DestroyExtSemaphore(void* extSemaphore) override { ShouldNotReachHere(); }
|
|
|
|
//! Acquire external graphics API object in the host thread
|
|
//! Needed for OpenGL objects on CPU device
|
|
|
|
bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext,
|
|
bool validateOnly) override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext,
|
|
bool validateOnly) override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
//! Releases non-blocking map target memory
|
|
virtual void freeMapTarget(amd::Memory& mem, void* target) { ShouldNotReachHere(); }
|
|
|
|
//! Empty implementation on Null device
|
|
bool globalFreeMemory(size_t* freeMemory) const override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
//! Empty implementation on Null device
|
|
bool amdFileRead(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
|
|
uint64_t* size_copied, int32_t* status) override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
//! Empty implementation on Null device
|
|
bool amdFileWrite(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
|
|
uint64_t* size_copied, int32_t* status) override {
|
|
ShouldNotReachHere();
|
|
return false;
|
|
}
|
|
|
|
bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
|
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) override {
|
|
return true;
|
|
}
|
|
|
|
bool IsHwEventReady(const amd::Event& event, bool wait = false,
|
|
amd::SyncPolicy policy = amd::SyncPolicy::Auto) const override {
|
|
return false;
|
|
}
|
|
|
|
void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const override {};
|
|
void ReleaseGlobalSignal(void* signal) const override {}
|
|
|
|
#if defined(__clang__)
|
|
#if __has_feature(address_sanitizer)
|
|
virtual device::UriLocator* createUriLocator() const {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
private:
|
|
static constexpr bool offlineDevice_ = true;
|
|
};
|
|
|
|
struct AgentInfo {
|
|
hsa_agent_t agent;
|
|
hsa_amd_memory_pool_t fine_grain_pool;
|
|
hsa_amd_memory_pool_t coarse_grain_pool;
|
|
hsa_amd_memory_pool_t kern_arg_pool;
|
|
hsa_amd_memory_pool_t ext_fine_grain_pool;
|
|
};
|
|
|
|
//! A HSA device ordinal (physical HSA device)
|
|
class Device : public NullDevice {
|
|
public:
|
|
//! Initialise the whole HSA device subsystem (init, device enumeration, etc).
|
|
static bool init();
|
|
static void tearDown();
|
|
|
|
//! Lookup all AMD HSA devices and memory regions.
|
|
static hsa_status_t iterateAgentCallback(hsa_agent_t agent, void* data);
|
|
static hsa_status_t iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t region, void* data);
|
|
static hsa_status_t iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t region, void* data);
|
|
static hsa_status_t loaderQueryHostAddress(const void* device, const void** host);
|
|
|
|
static bool loadHsaModules();
|
|
|
|
hsa_agent_t getBackendDevice() const { return bkendDevice_; }
|
|
//! Get the CPU agent with the least NUMA distance to this GPU
|
|
const hsa_agent_t& getCpuAgent() const { return cpu_agent_info_->agent; }
|
|
|
|
//! Get the CPU agent that is in a 'index' NUMA node
|
|
const hsa_agent_t getCpuAgent(int index) const {
|
|
if ((index < 0) || (index >= cpu_agents_.size())) {
|
|
// Return default CPU agent
|
|
return cpu_agent_info_->agent;
|
|
}
|
|
return cpu_agents_[index].agent;
|
|
}
|
|
|
|
void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU
|
|
|
|
void checkAtomicSupport(); //!< Check the support for pcie atomics
|
|
|
|
//! Destructor for the physical HSA device
|
|
virtual ~Device();
|
|
|
|
// Temporary, delete it later when HSA Runtime and KFD is fully fucntional.
|
|
void fake_device();
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// TODO: Below are all mocked up virtual functions from amd::Device, they may
|
|
// need real implementation.
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
//! Instantiate a new virtual device
|
|
virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = nullptr) override;
|
|
|
|
//! Construct an device program object from the ELF assuming it is valid
|
|
virtual device::Program* createProgram(amd::Program& owner,
|
|
amd::option::Options* options = nullptr) override;
|
|
|
|
virtual device::Memory* createMemory(amd::Memory& owner) const override;
|
|
virtual device::Memory* createMemory(size_t size, size_t alignment = 0) const override;
|
|
//! Sampler object allocation
|
|
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
|
|
device::Sampler** sampler //!< device sampler object
|
|
) const override;
|
|
|
|
//! Just returns nullptr for the dummy device
|
|
virtual device::Memory* createView(
|
|
amd::Memory& owner, //!< Owner memory object
|
|
const device::Memory& parent //!< Parent device memory object for the view
|
|
) const override {
|
|
return nullptr;
|
|
}
|
|
|
|
virtual device::Signal* createSignal() const override;
|
|
|
|
//! Acquire external graphics API object in the host thread
|
|
//! Needed for OpenGL objects on CPU device
|
|
virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext,
|
|
bool validateOnly) override;
|
|
|
|
/**
|
|
* @brief Removes the external device as an available device.
|
|
*
|
|
* @note: The current implementation is to avoid build break
|
|
* and does not represent actual / correct implementation. This
|
|
* needs to be done.
|
|
*/
|
|
bool unbindExternalDevice(
|
|
uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc.
|
|
void* const gfxDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL
|
|
void* gfxContext, //!< HGLRC/GLXContext handle
|
|
bool validateOnly //!< Only validate if the device can inter-operate with
|
|
//!< pDevice/pContext, do not bind.
|
|
) override;
|
|
|
|
//! Gets free memory on a GPU device
|
|
virtual bool globalFreeMemory(size_t* freeMemory) const override;
|
|
virtual void* hostAlloc(size_t size, size_t alignment,
|
|
MemorySegment mem_seg = MemorySegment::kNoAtomics,
|
|
const void* agentInfo = nullptr) const override; // nullptr uses default CPU agent
|
|
virtual void hostFree(void* ptr, size_t size = 0) const override;
|
|
|
|
virtual bool amdFileRead(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
|
|
uint64_t* size_copied, int32_t* status) override;
|
|
virtual bool amdFileWrite(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
|
|
uint64_t* size_copied, int32_t* status) override;
|
|
|
|
bool deviceAllowAccess(void* dst) const override;
|
|
|
|
bool allowPeerAccess(device::Memory* memory) const override;
|
|
void deviceVmemRelease(uint64_t mem_handle) const;
|
|
uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
|
|
|
|
void* deviceLocalAlloc(size_t size,
|
|
const AllocationFlags& flags = AllocationFlags{}) const override;
|
|
void* reserveMemory(size_t size, size_t alignment) const;
|
|
void releaseMemory(void* ptr, size_t size) const;
|
|
void memFree(void* ptr, size_t size) const;
|
|
|
|
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
|
|
cl_svm_mem_flags flags = CL_MEM_READ_WRITE, void* svmPtr = nullptr) const override;
|
|
|
|
virtual void svmFree(void* ptr) const override;
|
|
|
|
virtual bool SetSvmAttributes(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
|
|
bool use_cpu = false, int numa_id = kDefaultNumaNode) const override;
|
|
virtual bool GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
|
|
size_t num_attributes, const void* dev_ptr, size_t count) const override;
|
|
virtual size_t ScratchLimitCurrent() const final;
|
|
virtual bool UpdateScratchLimitCurrent(size_t limit) const final;
|
|
virtual void* virtualAlloc(void* req_addr, size_t size, size_t alignment) override;
|
|
virtual bool virtualFree(void* addr) override;
|
|
|
|
virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags,
|
|
VmmLocationType = VmmLocationType::kDevice) override;
|
|
virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const override;
|
|
virtual bool ValidateMemAccess(amd::Memory& mem, bool read_write) const override { return true; }
|
|
|
|
virtual bool ExportShareableVMMHandle(amd::Memory& amd_mem_obj, int flags, void* shareableHandle) override;
|
|
|
|
bool ImportShareableHSAHandle(void* osHandle, uint64_t* hsa_handle_ptr) const;
|
|
|
|
virtual amd::Memory* ImportShareableVMMHandle(void* osHandle) override;
|
|
|
|
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
|
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) override;
|
|
|
|
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false,
|
|
amd::SyncPolicy policy = amd::SyncPolicy::Auto) const override;
|
|
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const override;
|
|
virtual void ReleaseGlobalSignal(void* signal) const override;
|
|
virtual bool CreateUserEvent(amd::UserEvent* event) const override;
|
|
virtual void SetUserEvent(amd::UserEvent* event) const override;
|
|
|
|
//! Allocate host memory in terms of numa policy set by user
|
|
void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
|
|
|
|
//! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
|
|
//! return a new device pointer accessible by the GPU agent.
|
|
void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
|
|
|
|
//! Returns transfer engine object
|
|
const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
|
|
|
|
const size_t alloc_granularity() const { return alloc_granularity_; }
|
|
|
|
const hsa_profile_t agent_profile() const { return agent_profile_; }
|
|
|
|
//! Finds an appropriate map target
|
|
amd::Memory* findMapTarget(size_t size) const;
|
|
|
|
//! Adds a map target to the cache
|
|
bool addMapTarget(amd::Memory* memory) const;
|
|
|
|
//! Returns a ROC memory object from AMD memory object
|
|
roc::Memory* getRocMemory(amd::Memory* mem //!< Pointer to AMD memory object
|
|
) const;
|
|
|
|
//! Create internal blit program
|
|
bool createBlitProgram();
|
|
|
|
// P2P agents avaialble for this device
|
|
const std::vector<hsa_agent_t>& p2pAgents() const { return p2p_agents_; }
|
|
|
|
//! Returns the list of HSA agents used for IPC memory attach
|
|
const hsa_agent_t* IpcAgents() const { return p2p_agents_list_; }
|
|
|
|
// User enabled peer devices
|
|
const bool isP2pEnabled() const { return (enabled_p2p_devices_.size() > 0) ? true : false; }
|
|
|
|
// Update the global free memory size
|
|
void updateFreeMemory(size_t size, bool free);
|
|
|
|
//! Returns the lock object for the virtual gpus list
|
|
amd::Monitor& vgpusAccess() const { return vgpusAccess_; }
|
|
|
|
typedef std::vector<VirtualGPU*> VirtualGPUs;
|
|
//! Returns the list of all virtual GPUs running on this device
|
|
const VirtualGPUs& vgpus() const { return vgpus_; }
|
|
VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected)
|
|
|
|
VirtualGPU* xferQueue() const;
|
|
|
|
//! Acquire HSA queue. This method can create a new HSA queue or
|
|
hsa_queue_t* acquireQueue(
|
|
uint32_t queue_size_hint, bool coop_queue = false, const std::vector<uint32_t>& cuMask = {},
|
|
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal,
|
|
bool managed = false, bool dedicated_queue = false);
|
|
|
|
//! Release HSA queue
|
|
void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {}, bool coop_queue = false,
|
|
bool managed = false);
|
|
|
|
hsa_queue_t* AcquireActiveQueue(amd::CommandQueue::Priority priority);
|
|
bool ReleaseActiveQueue(hsa_queue_t* queue, amd::CommandQueue::Priority priority);
|
|
|
|
//! For the given HSA queue, return an existing hostcall buffer or create a
|
|
//! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
|
|
void* getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue = false,
|
|
const std::vector<uint32_t>& cuMask = {});
|
|
|
|
//! Return multi GPU grid launch sync buffer
|
|
address MGSync() const { return mg_sync_; }
|
|
|
|
//! Returns value for corresponding Link Attributes in a vector, given other device
|
|
virtual bool findLinkInfo(const amd::Device& other_device, std::vector<LinkAttrType>* link_attr) override;
|
|
|
|
//! Returns a GPU memory object from AMD memory object
|
|
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
|
|
) const;
|
|
|
|
//! Initialize memory in AMD HMM on the current device or keeps it in the host memory
|
|
bool SvmAllocInit(void* memory, size_t size) const;
|
|
|
|
void getGlobalCUMask(std::string cuMaskStr);
|
|
|
|
static hsa_status_t BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data);
|
|
|
|
static void RegisterBackendErrorCb();
|
|
|
|
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0) override;
|
|
|
|
virtual uint32_t getPreferredNumaNode() const final { return preferred_numa_node_; }
|
|
|
|
const bool isFineGrainSupported() const override;
|
|
|
|
//! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
|
|
bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info);
|
|
|
|
//! Allocates hidden heap for device memory allocations
|
|
void HiddenHeapAlloc(const VirtualGPU& gpu);
|
|
//! Init hidden heap for device memory allocations
|
|
void HiddenHeapInit(const VirtualGPU& gpu);
|
|
bool isXgmi() const override { return isXgmi_; }
|
|
|
|
//! SDMA engine allocation for per-stream affinity
|
|
uint32_t AllocateSdmaEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
|
|
hsa_agent_t dstAgent, hsa_agent_t srcAgent) const {
|
|
return sdma_engine_allocator_.AllocateEngine(vgpu, engine_type, dstAgent, srcAgent);
|
|
}
|
|
void ReleaseSdmaEngine(VirtualGPU* vgpu) const {
|
|
sdma_engine_allocator_.ReleaseEngine(vgpu);
|
|
}
|
|
//! Returns the map of code objects to kernels
|
|
const auto& KernelMap() const { return kernel_map_; }
|
|
//! Adds a kernel to the kernel map
|
|
void AddKernel(Kernel& gpuKernel) const;
|
|
//! Removes a kernel from the kernel map
|
|
void RemoveKernel(Kernel& gpuKernel) const;
|
|
|
|
// Returns the number of allocated queues for a given priority on this device
|
|
uint32_t NumQueues(uint qIndex) const { return num_queues_[qIndex].load(); }
|
|
|
|
//! enum for keeping the total and available queue priorities
|
|
enum QueuePriority : uint { Low = 0, Normal = 1, High = 2, Total = 3 };
|
|
|
|
//! Returns true if PM4 emulation is enabled
|
|
bool IsPm4Emulation() const { return pm4_emulation_; }
|
|
|
|
private:
|
|
bool create();
|
|
|
|
//! Construct a new physical HSA device
|
|
Device(hsa_agent_t bkendDevice);
|
|
|
|
static constexpr int kDefaultNumaNode = -1;
|
|
|
|
bool SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
|
|
bool first_alloc = false, bool use_cpu = false,
|
|
int numa_id = kDefaultNumaNode) const;
|
|
static constexpr hsa_signal_value_t InitSignalValue = 1;
|
|
|
|
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
|
|
|
amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources
|
|
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
|
|
|
|
bool populateOCLDeviceConstants();
|
|
static bool isHsaInitialized_;
|
|
static std::vector<hsa_agent_t> gpu_agents_;
|
|
static std::vector<AgentInfo> cpu_agents_;
|
|
uint32_t preferred_numa_node_;
|
|
std::vector<hsa_agent_t> p2p_agents_; //!< List of P2P agents available for this device
|
|
mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
|
|
hsa_agent_t bkendDevice_;
|
|
uint32_t pciDeviceId_;
|
|
hsa_agent_t* p2p_agents_list_ = nullptr;
|
|
hsa_profile_t agent_profile_;
|
|
hsa_amd_memory_pool_t group_segment_;
|
|
|
|
AgentInfo* cpu_agent_info_;
|
|
|
|
hsa_amd_memory_pool_t gpuvm_segment_;
|
|
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
|
|
hsa_amd_memory_pool_t gpu_ext_fine_grained_segment_;
|
|
hsa_signal_t prefetch_signal_; //!< Prefetch signal, used to explicitly prefetch SVM on device
|
|
std::atomic<int> cache_state_; //!< State of cache, kUnknown/kFlushedToDevice/kFlushedToSystem
|
|
|
|
size_t gpuvm_segment_max_alloc_;
|
|
size_t alloc_granularity_;
|
|
static constexpr bool offlineDevice_ = false;
|
|
VirtualGPU* xferQueue_; //!< Transfer queue, created on demand
|
|
|
|
std::atomic<size_t> freeMem_; //!< Total of free memory available
|
|
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
|
|
bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access
|
|
//!< mode
|
|
static address mg_sync_; //!< MGPU grid launch sync memory (SVM location)
|
|
|
|
struct QueueInfo {
|
|
int refCount; //! Reference counter. Shows how many time the queue was shared
|
|
void* hostcallBuffer_; //! Host call buffer for the HSA queue
|
|
bool hasDedicatedQueue_; //! True if this queue is a dedicated queue (e.g., null stream)
|
|
|
|
// Constructor
|
|
QueueInfo() : refCount(0), hostcallBuffer_(nullptr), hasDedicatedQueue_(false) {}
|
|
|
|
//! Get the current hardware queue depth (wptr - rptr)
|
|
static uint64_t GetHwQueueDepth(hsa_queue_t* queue) {
|
|
uint64_t wptr = Hsa::queue_load_write_index_relaxed(queue);
|
|
uint64_t rptr = Hsa::queue_load_read_index_relaxed(queue);
|
|
return wptr - rptr;
|
|
}
|
|
|
|
//! Get a combined metric for queue selection (lower is better)
|
|
uint64_t GetLoadMetric(hsa_queue_t* queue, uint32_t mode = 1) const {
|
|
auto depth = GetHwQueueDepth(queue);
|
|
|
|
// Dedicated queue penalty: prefer regular queues, but use dedicated if regular queues
|
|
// have depth > ~128 packets. Penalty = 128 << 4 = 2048.
|
|
uint64_t dedicated_queue_penalty = hasDedicatedQueue_ ? 2048 : 0;
|
|
|
|
// Advanced weighted metric: Give queue depth significantly more weight than refCount
|
|
uint64_t metric = dedicated_queue_penalty + (depth << 4) + static_cast<uint64_t>(refCount);
|
|
return metric;
|
|
}
|
|
};
|
|
|
|
struct QueueCompare {
|
|
const Device* device_;
|
|
|
|
QueueCompare(const Device* dev = nullptr) : device_(dev) {}
|
|
|
|
// Customized queue compare operator to make sure the queues are sorted in the creation order
|
|
bool operator()(hsa_queue_t* lhs, hsa_queue_t* rhs) const {
|
|
if (device_ != nullptr && device_->settings().dynamic_queues_ > 0) {
|
|
return (lhs->id < rhs->id) ? true : false;
|
|
} else {
|
|
return (lhs < rhs) ? true : false;
|
|
}
|
|
}
|
|
};
|
|
//! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
|
|
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queuePool_;
|
|
amd::Monitor active_queue_access_; //!< Lock to serialise virtual gpu list access
|
|
std::atomic<uint32_t> num_queues_[QueuePriority::Total] = {}; //!< Per-priority queue counters
|
|
|
|
//! Use dynamic queues mode to get a queue from pool
|
|
hsa_queue_t* getQueueFromPool(const uint qIndex, bool force_reuse = false);
|
|
|
|
void* coopHostcallBuffer_;
|
|
//! returns value for corresponding LinkAttrbutes in a vector given Memory pool.
|
|
virtual bool findLinkInfo(const hsa_amd_memory_pool_t& pool,
|
|
std::vector<LinkAttrType>* link_attr);
|
|
|
|
//! Pool of HSA queues with custom CU masks
|
|
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queueWithCUMaskPool_;
|
|
hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg,
|
|
const AgentInfo* agentInfo = nullptr) const;
|
|
//! Read and Write mask for device<->host
|
|
uint32_t maxSdmaReadMask_;
|
|
uint32_t maxSdmaWriteMask_;
|
|
bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU
|
|
bool pm4_emulation_ = false; //!< Flag to indicate if PM4 emulation is enabled
|
|
uint32_t numHwPipes_; //!< Number of hardware pipes
|
|
|
|
//! SDMA engine allocator for per-stream affinity
|
|
struct SdmaEngineAllocator {
|
|
amd::Monitor lock_; //!< Protects the allocation state
|
|
std::unordered_map<VirtualGPU*, uint32_t> vgpu_to_engine_; //!< VirtualGPU -> engine mask
|
|
std::atomic<uint32_t> next_rr_engine_{0}; //!< Simple RR counter for future use
|
|
const Device& device_; //!< Reference to parent device for accessing masks
|
|
|
|
SdmaEngineAllocator(const Device& device)
|
|
: lock_(true), device_(device) {}
|
|
|
|
//! Allocate an SDMA engine for a VirtualGPU
|
|
//! Queries HSA for engine status and preferred engines, then allocates
|
|
//! For inter-GPU copies, strongly prefers recommended engines even if already allocated
|
|
uint32_t AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
|
|
hsa_agent_t dstAgent, hsa_agent_t srcAgent);
|
|
|
|
//! Release engine allocation for a VirtualGPU
|
|
void ReleaseEngine(VirtualGPU* vgpu);
|
|
};
|
|
mutable SdmaEngineAllocator sdma_engine_allocator_;
|
|
|
|
//! Code object to kernel info map (used in the crash dump analysis)
|
|
mutable std::map<uint64_t, Kernel&> kernel_map_;
|
|
|
|
//! Friend function callbackQueue can access and set device class variables.
|
|
friend void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data);
|
|
|
|
public:
|
|
std::atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
|
|
|
#if defined(__clang__)
|
|
#if __has_feature(address_sanitizer)
|
|
virtual device::UriLocator* createUriLocator() const;
|
|
#endif
|
|
#endif
|
|
}; // class roc::Device
|
|
|
|
//! Queue callback taking an HSA status code, the HSA queue it refers to and an opaque
//! user data pointer. It is declared a friend of roc::Device, so its definition may access
//! the device's private members.
//! NOTE(review): presumably registered as the per-queue HSA error callback - confirm
//! against the definition.
void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data);
|
|
|
|
} // namespace amd::roc
|
|
|
|
/**
|
|
* @}
|
|
*/
|