Files
SaleelK 340f3aa887 clr: Implement dynamic stream to HWq logic (#1958)
* clr: Implement dynamic stream to HW queue assignment

This change implements dynamic stream to hardware queue (HWq) mapping
with the following features:

* Queue depth heuristics with weights for optimal HWq assignment
* Make last used queue sticky for better locality
* Use pipe HWq to pipe mapping - gfx9 follows a round-robin queue to
  pipe mapping based on creation order (single process per device only,
  as pipe ID is statically assigned by runtime)
* More aggressive heuristic usage for better queue distribution
* Extend dynamic queues support for all stream priorities

Environment variables:
* DEBUG_HIP_DYNAMIC_QUEUE: 0 - disabled, 1 - Depth heuristics 2 -
  Depth+Pipe heuristics
* DEBUG_HIP_IGNORE_STREAM_PRIORITY=1: ignore priority stream creation

* clr: Clean up last_used_queue_
2026-01-23 10:40:54 -08:00

791 строка
30 KiB
C++

/* Copyright (c) 2009 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#pragma once
#include "top.hpp"
#include "CL/cl.h"
#include "device/device.hpp"
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "platform/perfctr.hpp"
#include "platform/memory.hpp"
#include "utils/concurrent.hpp"
#include "thread/thread.hpp"
#include "thread/monitor.hpp"
#include "utils/versions.hpp"
#include "device/rocm/rocrctx.hpp"
#include "device/rocm/rocsettings.hpp"
#include "device/rocm/rocvirtual.hpp"
#include "device/rocm/rocdefs.hpp"
#include "device/rocm/rocprintf.hpp"
#include "device/rocm/rocglinterop.hpp"
#include <atomic>
#include <iostream>
#include <vector>
#include <memory>
/*! \addtogroup HSA
* @{
*/
//! HSA Device Implementation
namespace amd::roc {
/**
* @brief List of environment variables that could be used to
* configure the behavior of Hsa Runtime
*/
#define ENVVAR_HSA_POLL_KERNEL_COMPLETION "HSA_POLL_COMPLETION"
//! Forward declarations
class Command;
class Device;
class GpuCommand;
class Heap;
class HeapBlock;
class Program;
class Kernel;
class Memory;
class Resource;
class VirtualDevice;
class PrintfDbg;
class ProfilingSignal : public amd::ReferenceCountedObject {
public:
hsa_signal_t signal_; //!< HSA signal to track profiling information
Timestamp* ts_; //!< Timestamp object associated with the signal
HwQueueEngine engine_; //!< Engine used with this signal
amd::Monitor lock_; //!< Signal lock for update
typedef union {
struct {
uint32_t done_ : 1; //!< True if signal is done
uint32_t isPacketDispatch_ : 1; //!< True if the packet, used with the signal, is dispatch
uint32_t interrupt_ : 1; //!< True if the signal will trigger an interrupt
uint32_t reserved_ : 29;
};
uint32_t data_;
} Flags;
Flags flags_;
//! Cached timing data - populated when signal completes, avoids repeated HSA calls
struct CachedTiming {
uint64_t start_ = 0; //!< Cached start timestamp from HSA
uint64_t end_ = 0; //!< Cached end timestamp from HSA
bool valid_ = false; //!< True if timing data has been cached
};
CachedTiming cached_timing_;
ProfilingSignal()
: ts_(nullptr),
engine_(HwQueueEngine::Compute),
lock_(true) /* Signal Ops Lock */
{
signal_.handle = 0;
flags_.data_ = 0;
flags_.done_ = true;
}
virtual ~ProfilingSignal();
amd::Monitor& LockSignalOps() { return lock_; }
//! Cache timing data from HSA for this signal (called once when signal completes)
void CacheTimingData(hsa_agent_t gpu_device);
//! Reset cached timing for signal reuse
void ResetCachedTiming() {
amd::ScopedLock lock(lock_);
cached_timing_.start_ = 0;
cached_timing_.end_ = 0;
cached_timing_.valid_ = false;
}
//! Check if timing is already cached
bool IsTimingCached() const { return cached_timing_.valid_; }
//! Get cached timing values
void GetCachedTiming(uint64_t& start, uint64_t& end) {
amd::ScopedLock lock(lock_);
start = cached_timing_.start_;
end = cached_timing_.end_;
}
};
class Sampler : public device::Sampler {
public:
//! Constructor
Sampler(const Device& dev) : dev_(dev) {}
//! Default destructor for the device memory object
virtual ~Sampler();
//! Creates a device sampler from the OCL sampler state
bool create(const amd::Sampler& owner //!< AMD sampler object
);
private:
void fillSampleDescriptor(hsa_ext_sampler_descriptor_v2_t& samplerDescriptor,
const amd::Sampler& sampler) const;
Sampler& operator=(const Sampler&);
//! Disable operator=
Sampler(const Sampler&);
const Device& dev_; //!< Device object associated with the sampler
hsa_ext_sampler_t hsa_sampler;
};
// A NULL Device type used only for offline compilation
// Only functions that are used for compilation will be in this device
class NullDevice : public amd::Device {
public:
//! constructor
NullDevice(){};
//! create the device
bool create(const amd::Isa& isa);
//! Initialise all the offline devices that can be used for compilation
static bool init();
//! Teardown for offline devices
static void tearDown();
//! Destructor for the Null device
virtual ~NullDevice();
const Settings& settings() const { return static_cast<Settings&>(*settings_); }
//! Construct an device program object from the ELF assuming it is valid
device::Program* createProgram(amd::Program& owner,
amd::option::Options* options = nullptr) override;
// List of dummy functions which are disabled for NullDevice
//! Create a new virtual device environment.
device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = nullptr) override {
ShouldNotReachHere();
return nullptr;
}
virtual bool registerSvmMemory(void* ptr, size_t size) const {
ShouldNotReachHere();
return false;
}
virtual void deregisterSvmMemory(void* ptr) const { ShouldNotReachHere(); }
//! Just returns nullptr for the dummy device
device::Memory* createMemory(amd::Memory& owner) const override {
ShouldNotReachHere();
return nullptr;
}
device::Memory* createMemory(size_t size, size_t alignment = 0) const override {
ShouldNotReachHere();
return nullptr;
}
//! Sampler object allocation
bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
device::Sampler** sampler //!< device sampler object
) const override {
ShouldNotReachHere();
return true;
}
//! Just returns nullptr for the dummy device
device::Memory* createView(
amd::Memory& owner, //!< Owner memory object
const device::Memory& parent //!< Parent device memory object for the view
) const override {
ShouldNotReachHere();
return nullptr;
}
device::Signal* createSignal() const override {
ShouldNotReachHere();
return nullptr;
}
//! Just returns nullptr for the dummy device
void* svmAlloc(amd::Context& context, //!< The context used to create a buffer
size_t size, //!< size of svm spaces
size_t alignment, //!< alignment requirement of svm spaces
cl_svm_mem_flags flags, //!< flags of creation svm spaces
void* svmPtr //!< existing svm pointer for mGPU case
) const override {
ShouldNotReachHere();
return nullptr;
}
//! Just returns nullptr for the dummy device
void svmFree(void* ptr //!< svm pointer needed to be freed
) const override {
ShouldNotReachHere();
return;
}
void* virtualAlloc(void* req_addr, size_t size, size_t alignment) override {
ShouldNotReachHere();
return nullptr;
}
bool virtualFree(void* addr) override {
ShouldNotReachHere();
return true;
}
virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags,
VmmLocationType = VmmLocationType::kDevice) override {
ShouldNotReachHere();
return false;
}
virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const override {
ShouldNotReachHere();
return false;
}
virtual bool ValidateMemAccess(amd::Memory& mem, bool read_write) const override {
ShouldNotReachHere();
return true;
}
//! Determine if we can use device memory for SVM
const bool forceFineGrain(amd::Memory* memory) const {
return (memory->getContext().devices().size() > 1);
}
virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle,
amd::ExternalSemaphoreHandleType sem_handle_type) override {
ShouldNotReachHere();
return false;
}
void DestroyExtSemaphore(void* extSemaphore) override { ShouldNotReachHere(); }
//! Acquire external graphics API object in the host thread
//! Needed for OpenGL objects on CPU device
bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext,
bool validateOnly) override {
ShouldNotReachHere();
return false;
}
bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext,
bool validateOnly) override {
ShouldNotReachHere();
return false;
}
//! Releases non-blocking map target memory
virtual void freeMapTarget(amd::Memory& mem, void* target) { ShouldNotReachHere(); }
//! Empty implementation on Null device
bool globalFreeMemory(size_t* freeMemory) const override {
ShouldNotReachHere();
return false;
}
//! Empty implementation on Null device
bool amdFileRead(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
uint64_t* size_copied, int32_t* status) override {
ShouldNotReachHere();
return false;
}
//! Empty implementation on Null device
bool amdFileWrite(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
uint64_t* size_copied, int32_t* status) override {
ShouldNotReachHere();
return false;
}
bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) override {
return true;
}
bool IsHwEventReady(const amd::Event& event, bool wait = false,
amd::SyncPolicy policy = amd::SyncPolicy::Auto) const override {
return false;
}
void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const override {};
void ReleaseGlobalSignal(void* signal) const override {}
#if defined(__clang__)
#if __has_feature(address_sanitizer)
virtual device::UriLocator* createUriLocator() const {
ShouldNotReachHere();
return nullptr;
}
#endif
#endif
private:
static constexpr bool offlineDevice_ = true;
};
struct AgentInfo {
hsa_agent_t agent;
hsa_amd_memory_pool_t fine_grain_pool;
hsa_amd_memory_pool_t coarse_grain_pool;
hsa_amd_memory_pool_t kern_arg_pool;
hsa_amd_memory_pool_t ext_fine_grain_pool;
};
//! A HSA device ordinal (physical HSA device)
class Device : public NullDevice {
public:
//! Initialise the whole HSA device subsystem (init, device enumeration, etc).
static bool init();
static void tearDown();
//! Lookup all AMD HSA devices and memory regions.
static hsa_status_t iterateAgentCallback(hsa_agent_t agent, void* data);
static hsa_status_t iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t region, void* data);
static hsa_status_t iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t region, void* data);
static hsa_status_t loaderQueryHostAddress(const void* device, const void** host);
static bool loadHsaModules();
hsa_agent_t getBackendDevice() const { return bkendDevice_; }
//! Get the CPU agent with the least NUMA distance to this GPU
const hsa_agent_t& getCpuAgent() const { return cpu_agent_info_->agent; }
//! Get the CPU agent that is in a 'index' NUMA node
const hsa_agent_t getCpuAgent(int index) const {
if ((index < 0) || (index >= cpu_agents_.size())) {
// Return default CPU agent
return cpu_agent_info_->agent;
}
return cpu_agents_[index].agent;
}
void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU
void checkAtomicSupport(); //!< Check the support for pcie atomics
//! Destructor for the physical HSA device
virtual ~Device();
// Temporary, delete it later when HSA Runtime and KFD is fully fucntional.
void fake_device();
///////////////////////////////////////////////////////////////////////////////
// TODO: Below are all mocked up virtual functions from amd::Device, they may
// need real implementation.
///////////////////////////////////////////////////////////////////////////////
//! Instantiate a new virtual device
virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = nullptr) override;
//! Construct an device program object from the ELF assuming it is valid
virtual device::Program* createProgram(amd::Program& owner,
amd::option::Options* options = nullptr) override;
virtual device::Memory* createMemory(amd::Memory& owner) const override;
virtual device::Memory* createMemory(size_t size, size_t alignment = 0) const override;
//! Sampler object allocation
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
device::Sampler** sampler //!< device sampler object
) const override;
//! Just returns nullptr for the dummy device
virtual device::Memory* createView(
amd::Memory& owner, //!< Owner memory object
const device::Memory& parent //!< Parent device memory object for the view
) const override {
return nullptr;
}
virtual device::Signal* createSignal() const override;
//! Acquire external graphics API object in the host thread
//! Needed for OpenGL objects on CPU device
virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext,
bool validateOnly) override;
/**
* @brief Removes the external device as an available device.
*
* @note: The current implementation is to avoid build break
* and does not represent actual / correct implementation. This
* needs to be done.
*/
bool unbindExternalDevice(
uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc.
void* const gfxDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL
void* gfxContext, //!< HGLRC/GLXContext handle
bool validateOnly //!< Only validate if the device can inter-operate with
//!< pDevice/pContext, do not bind.
) override;
//! Gets free memory on a GPU device
virtual bool globalFreeMemory(size_t* freeMemory) const override;
virtual void* hostAlloc(size_t size, size_t alignment,
MemorySegment mem_seg = MemorySegment::kNoAtomics,
const void* agentInfo = nullptr) const override; // nullptr uses default CPU agent
virtual void hostFree(void* ptr, size_t size = 0) const override;
virtual bool amdFileRead(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
uint64_t* size_copied, int32_t* status) override;
virtual bool amdFileWrite(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
uint64_t* size_copied, int32_t* status) override;
bool deviceAllowAccess(void* dst) const override;
bool allowPeerAccess(device::Memory* memory) const override;
void deviceVmemRelease(uint64_t mem_handle) const;
uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
void* deviceLocalAlloc(size_t size,
const AllocationFlags& flags = AllocationFlags{}) const override;
void* reserveMemory(size_t size, size_t alignment) const;
void releaseMemory(void* ptr, size_t size) const;
void memFree(void* ptr, size_t size) const;
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
cl_svm_mem_flags flags = CL_MEM_READ_WRITE, void* svmPtr = nullptr) const override;
virtual void svmFree(void* ptr) const override;
virtual bool SetSvmAttributes(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
bool use_cpu = false, int numa_id = kDefaultNumaNode) const override;
virtual bool GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
size_t num_attributes, const void* dev_ptr, size_t count) const override;
virtual size_t ScratchLimitCurrent() const final;
virtual bool UpdateScratchLimitCurrent(size_t limit) const final;
virtual void* virtualAlloc(void* req_addr, size_t size, size_t alignment) override;
virtual bool virtualFree(void* addr) override;
virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags,
VmmLocationType = VmmLocationType::kDevice) override;
virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const override;
virtual bool ValidateMemAccess(amd::Memory& mem, bool read_write) const override { return true; }
virtual bool ExportShareableVMMHandle(amd::Memory& amd_mem_obj, int flags, void* shareableHandle) override;
bool ImportShareableHSAHandle(void* osHandle, uint64_t* hsa_handle_ptr) const;
virtual amd::Memory* ImportShareableVMMHandle(void* osHandle) override;
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) override;
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false,
amd::SyncPolicy policy = amd::SyncPolicy::Auto) const override;
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const override;
virtual void ReleaseGlobalSignal(void* signal) const override;
virtual bool CreateUserEvent(amd::UserEvent* event) const override;
virtual void SetUserEvent(amd::UserEvent* event) const override;
//! Allocate host memory in terms of numa policy set by user
void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
//! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
//! return a new device pointer accessible by the GPU agent.
void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
//! Returns transfer engine object
const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
const size_t alloc_granularity() const { return alloc_granularity_; }
const hsa_profile_t agent_profile() const { return agent_profile_; }
//! Finds an appropriate map target
amd::Memory* findMapTarget(size_t size) const;
//! Adds a map target to the cache
bool addMapTarget(amd::Memory* memory) const;
//! Returns a ROC memory object from AMD memory object
roc::Memory* getRocMemory(amd::Memory* mem //!< Pointer to AMD memory object
) const;
//! Create internal blit program
bool createBlitProgram();
// P2P agents avaialble for this device
const std::vector<hsa_agent_t>& p2pAgents() const { return p2p_agents_; }
//! Returns the list of HSA agents used for IPC memory attach
const hsa_agent_t* IpcAgents() const { return p2p_agents_list_; }
// User enabled peer devices
const bool isP2pEnabled() const { return (enabled_p2p_devices_.size() > 0) ? true : false; }
// Update the global free memory size
void updateFreeMemory(size_t size, bool free);
//! Returns the lock object for the virtual gpus list
amd::Monitor& vgpusAccess() const { return vgpusAccess_; }
typedef std::vector<VirtualGPU*> VirtualGPUs;
//! Returns the list of all virtual GPUs running on this device
const VirtualGPUs& vgpus() const { return vgpus_; }
VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected)
VirtualGPU* xferQueue() const;
//! Acquire HSA queue. This method can create a new HSA queue or
hsa_queue_t* acquireQueue(
uint32_t queue_size_hint, bool coop_queue = false, const std::vector<uint32_t>& cuMask = {},
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal,
bool managed = false, bool dedicated_queue = false);
//! Release HSA queue
void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {}, bool coop_queue = false,
bool managed = false);
hsa_queue_t* AcquireActiveQueue(amd::CommandQueue::Priority priority);
bool ReleaseActiveQueue(hsa_queue_t* queue, amd::CommandQueue::Priority priority);
//! For the given HSA queue, return an existing hostcall buffer or create a
//! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
void* getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue = false,
const std::vector<uint32_t>& cuMask = {});
//! Return multi GPU grid launch sync buffer
address MGSync() const { return mg_sync_; }
//! Returns value for corresponding Link Attributes in a vector, given other device
virtual bool findLinkInfo(const amd::Device& other_device, std::vector<LinkAttrType>* link_attr) override;
//! Returns a GPU memory object from AMD memory object
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
) const;
//! Initialize memory in AMD HMM on the current device or keeps it in the host memory
bool SvmAllocInit(void* memory, size_t size) const;
void getGlobalCUMask(std::string cuMaskStr);
static hsa_status_t BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data);
static void RegisterBackendErrorCb();
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0) override;
virtual uint32_t getPreferredNumaNode() const final { return preferred_numa_node_; }
const bool isFineGrainSupported() const override;
//! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info);
//! Allocates hidden heap for device memory allocations
void HiddenHeapAlloc(const VirtualGPU& gpu);
//! Init hidden heap for device memory allocations
void HiddenHeapInit(const VirtualGPU& gpu);
bool isXgmi() const override { return isXgmi_; }
//! SDMA engine allocation for per-stream affinity
uint32_t AllocateSdmaEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
hsa_agent_t dstAgent, hsa_agent_t srcAgent) const {
return sdma_engine_allocator_.AllocateEngine(vgpu, engine_type, dstAgent, srcAgent);
}
void ReleaseSdmaEngine(VirtualGPU* vgpu) const {
sdma_engine_allocator_.ReleaseEngine(vgpu);
}
//! Returns the map of code objects to kernels
const auto& KernelMap() const { return kernel_map_; }
//! Adds a kernel to the kernel map
void AddKernel(Kernel& gpuKernel) const;
//! Removes a kernel from the kernel map
void RemoveKernel(Kernel& gpuKernel) const;
// Returns the number of allocated queues for a given priority on this device
uint32_t NumQueues(uint qIndex) const { return num_queues_[qIndex].load(); }
//! enum for keeping the total and available queue priorities
enum QueuePriority : uint { Low = 0, Normal = 1, High = 2, Total = 3 };
//! Returns true if PM4 emulation is enabled
bool IsPm4Emulation() const { return pm4_emulation_; }
private:
bool create();
//! Construct a new physical HSA device
Device(hsa_agent_t bkendDevice);
static constexpr int kDefaultNumaNode = -1;
bool SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
bool first_alloc = false, bool use_cpu = false,
int numa_id = kDefaultNumaNode) const;
static constexpr hsa_signal_value_t InitSignalValue = 1;
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
bool populateOCLDeviceConstants();
static bool isHsaInitialized_;
static std::vector<hsa_agent_t> gpu_agents_;
static std::vector<AgentInfo> cpu_agents_;
uint32_t preferred_numa_node_;
std::vector<hsa_agent_t> p2p_agents_; //!< List of P2P agents available for this device
mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
hsa_agent_t bkendDevice_;
uint32_t pciDeviceId_;
hsa_agent_t* p2p_agents_list_ = nullptr;
hsa_profile_t agent_profile_;
hsa_amd_memory_pool_t group_segment_;
AgentInfo* cpu_agent_info_;
hsa_amd_memory_pool_t gpuvm_segment_;
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
hsa_amd_memory_pool_t gpu_ext_fine_grained_segment_;
hsa_signal_t prefetch_signal_; //!< Prefetch signal, used to explicitly prefetch SVM on device
std::atomic<int> cache_state_; //!< State of cache, kUnknown/kFlushedToDevice/kFlushedToSystem
size_t gpuvm_segment_max_alloc_;
size_t alloc_granularity_;
static constexpr bool offlineDevice_ = false;
VirtualGPU* xferQueue_; //!< Transfer queue, created on demand
std::atomic<size_t> freeMem_; //!< Total of free memory available
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access
//!< mode
static address mg_sync_; //!< MGPU grid launch sync memory (SVM location)
struct QueueInfo {
int refCount; //! Reference counter. Shows how many time the queue was shared
void* hostcallBuffer_; //! Host call buffer for the HSA queue
bool hasDedicatedQueue_; //! True if this queue is a dedicated queue (e.g., null stream)
// Constructor
QueueInfo() : refCount(0), hostcallBuffer_(nullptr), hasDedicatedQueue_(false) {}
//! Get the current hardware queue depth (wptr - rptr)
static uint64_t GetHwQueueDepth(hsa_queue_t* queue) {
uint64_t wptr = Hsa::queue_load_write_index_relaxed(queue);
uint64_t rptr = Hsa::queue_load_read_index_relaxed(queue);
return wptr - rptr;
}
//! Get a combined metric for queue selection (lower is better)
uint64_t GetLoadMetric(hsa_queue_t* queue, uint32_t mode = 1) const {
auto depth = GetHwQueueDepth(queue);
// Dedicated queue penalty: prefer regular queues, but use dedicated if regular queues
// have depth > ~128 packets. Penalty = 128 << 4 = 2048.
uint64_t dedicated_queue_penalty = hasDedicatedQueue_ ? 2048 : 0;
// Advanced weighted metric: Give queue depth significantly more weight than refCount
uint64_t metric = dedicated_queue_penalty + (depth << 4) + static_cast<uint64_t>(refCount);
return metric;
}
};
struct QueueCompare {
const Device* device_;
QueueCompare(const Device* dev = nullptr) : device_(dev) {}
// Customized queue compare operator to make sure the queues are sorted in the creation order
bool operator()(hsa_queue_t* lhs, hsa_queue_t* rhs) const {
if (device_ != nullptr && device_->settings().dynamic_queues_ > 0) {
return (lhs->id < rhs->id) ? true : false;
} else {
return (lhs < rhs) ? true : false;
}
}
};
//! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queuePool_;
amd::Monitor active_queue_access_; //!< Lock to serialise virtual gpu list access
std::atomic<uint32_t> num_queues_[QueuePriority::Total] = {}; //!< Per-priority queue counters
//! Use dynamic queues mode to get a queue from pool
hsa_queue_t* getQueueFromPool(const uint qIndex, bool force_reuse = false);
void* coopHostcallBuffer_;
//! returns value for corresponding LinkAttrbutes in a vector given Memory pool.
virtual bool findLinkInfo(const hsa_amd_memory_pool_t& pool,
std::vector<LinkAttrType>* link_attr);
//! Pool of HSA queues with custom CU masks
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queueWithCUMaskPool_;
hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg,
const AgentInfo* agentInfo = nullptr) const;
//! Read and Write mask for device<->host
uint32_t maxSdmaReadMask_;
uint32_t maxSdmaWriteMask_;
bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU
bool pm4_emulation_ = false; //!< Flag to indicate if PM4 emulation is enabled
uint32_t numHwPipes_; //!< Number of hardware pipes
//! SDMA engine allocator for per-stream affinity
struct SdmaEngineAllocator {
amd::Monitor lock_; //!< Protects the allocation state
std::unordered_map<VirtualGPU*, uint32_t> vgpu_to_engine_; //!< VirtualGPU -> engine mask
std::atomic<uint32_t> next_rr_engine_{0}; //!< Simple RR counter for future use
const Device& device_; //!< Reference to parent device for accessing masks
SdmaEngineAllocator(const Device& device)
: lock_(true), device_(device) {}
//! Allocate an SDMA engine for a VirtualGPU
//! Queries HSA for engine status and preferred engines, then allocates
//! For inter-GPU copies, strongly prefers recommended engines even if already allocated
uint32_t AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
hsa_agent_t dstAgent, hsa_agent_t srcAgent);
//! Release engine allocation for a VirtualGPU
void ReleaseEngine(VirtualGPU* vgpu);
};
mutable SdmaEngineAllocator sdma_engine_allocator_;
//! Code object to kernel info map (used in the crash dump analysis)
mutable std::map<uint64_t, Kernel&> kernel_map_;
//! Friend function callbackQueue can access and set device class variables.
friend void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data);
public:
std::atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
#if defined(__clang__)
#if __has_feature(address_sanitizer)
virtual device::UriLocator* createUriLocator() const;
#endif
#endif
}; // class roc::Device
void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data);
} // namespace amd::roc
/**
* @}
*/