SWDEV-184709 - support hipLaunchCooperativeKernel()
Add ROCr cooperative queue allocation
Change-Id: I1384482692f4080d31255b09e0f68a21ccad3da8
[ROCm/clr commit: 7ef8dfdfe7]
This commit is contained in:
committed by
Aakash Sudhanwa
parent
782a76511b
commit
14aa72890e
@@ -1457,14 +1457,20 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
|
||||
amd::ScopedLock lock(vgpusAccess());
|
||||
|
||||
bool profiling = (queue != nullptr) && queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
|
||||
bool cooperative = false;
|
||||
|
||||
profiling |= (queue == nullptr) ? true : false;
|
||||
// If amd command queue is null, then it's an internal device queue
|
||||
if (queue == nullptr) {
|
||||
// In HIP mode the device queue will be allocated for the cooperative launches only
|
||||
cooperative = amd::IS_HIP;
|
||||
profiling = amd::IS_HIP;
|
||||
}
|
||||
|
||||
// Initialization of heap and other resources occurs during the command
|
||||
// queue creation time.
|
||||
VirtualGPU* virtualDevice = new VirtualGPU(*this);
|
||||
VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative);
|
||||
|
||||
if (!virtualDevice->create(profiling)) {
|
||||
if (!virtualDevice->create()) {
|
||||
delete virtualDevice;
|
||||
return nullptr;
|
||||
}
|
||||
@@ -1874,12 +1880,13 @@ VirtualGPU* Device::xferQueue() const {
|
||||
return xferQueue_;
|
||||
}
|
||||
|
||||
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
||||
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
||||
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
||||
bool result = true;
|
||||
return result;
|
||||
}
|
||||
|
||||
hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
|
||||
hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue) {
|
||||
assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
|
||||
queuePool_.size(), GPU_MAX_HW_QUEUES);
|
||||
@@ -1907,8 +1914,15 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
|
||||
}
|
||||
auto queue_size = (queue_max_packets < queue_size_hint) ? queue_max_packets : queue_size_hint;
|
||||
|
||||
hsa_queue_t *queue;
|
||||
while (hsa_queue_create(_bkendDevice, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr,
|
||||
hsa_queue_t* queue;
|
||||
auto queue_type = HSA_QUEUE_TYPE_MULTI;
|
||||
|
||||
// Enable cooperative queue for the device queue
|
||||
if (coop_queue) {
|
||||
queue_type = HSA_QUEUE_TYPE_COOPERATIVE;
|
||||
}
|
||||
|
||||
while (hsa_queue_create(_bkendDevice, queue_size, queue_type, nullptr, nullptr,
|
||||
std::numeric_limits<uint>::max(), std::numeric_limits<uint>::max(),
|
||||
&queue) != HSA_STATUS_SUCCESS) {
|
||||
queue_size >>= 1;
|
||||
@@ -1919,6 +1933,11 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d",
|
||||
queue, queue_size);
|
||||
hsa_amd_profiling_set_profiler_enabled(queue, 1);
|
||||
if (coop_queue) {
|
||||
// Skip queue recycling for cooperative queues, since there should be just one
|
||||
// per device.
|
||||
return queue;
|
||||
}
|
||||
auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
|
||||
assert(result.second && "QueueInfo already exists");
|
||||
auto &qInfo = result.first->second;
|
||||
@@ -1928,26 +1947,26 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
|
||||
|
||||
void Device::releaseQueue(hsa_queue_t* queue) {
|
||||
auto qIter = queuePool_.find(queue);
|
||||
assert(qIter != queuePool_.end());
|
||||
if (qIter != queuePool_.end()) {
|
||||
auto &qInfo = qIter->second;
|
||||
assert(qInfo.refCount > 0);
|
||||
qInfo.refCount--;
|
||||
if (qInfo.refCount != 0) {
|
||||
return;
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
|
||||
|
||||
auto &qInfo = qIter->second;
|
||||
assert(qInfo.refCount > 0);
|
||||
qInfo.refCount--;
|
||||
if (qInfo.refCount != 0) {
|
||||
return;
|
||||
if (qInfo.hostcallBuffer_) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
|
||||
qInfo.hostcallBuffer_, queue);
|
||||
disableHostcalls(qInfo.hostcallBuffer_, queue);
|
||||
context().svmFree(qInfo.hostcallBuffer_);
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
|
||||
queuePool_.erase(qIter);
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
|
||||
|
||||
if (qInfo.hostcallBuffer_) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
|
||||
qInfo.hostcallBuffer_, queue);
|
||||
disableHostcalls(qInfo.hostcallBuffer_, queue);
|
||||
context().svmFree(qInfo.hostcallBuffer_);
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
|
||||
hsa_queue_destroy(queue);
|
||||
queuePool_.erase(qIter);
|
||||
}
|
||||
|
||||
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
|
||||
|
||||
@@ -432,7 +432,7 @@ class Device : public NullDevice {
|
||||
|
||||
//! Acquire HSA queue. This method can create a new HSA queue or
|
||||
//! share a previously created one
|
||||
hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
|
||||
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false);
|
||||
|
||||
//! Release HSA queue
|
||||
void releaseQueue(hsa_queue_t*);
|
||||
|
||||
@@ -584,8 +584,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
|
||||
return true;
|
||||
}
|
||||
|
||||
VirtualGPU::VirtualGPU(Device& device)
|
||||
VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative)
|
||||
: device::VirtualDevice(device),
|
||||
state_(0),
|
||||
gpu_queue_(nullptr),
|
||||
roc_device_(device),
|
||||
virtualQueue_(nullptr),
|
||||
@@ -603,6 +604,8 @@ VirtualGPU::VirtualGPU(Device& device)
|
||||
// Initialize the last signal and dispatch flags
|
||||
timestamp_ = nullptr;
|
||||
hasPendingDispatch_ = false;
|
||||
profiling_ = profiling;
|
||||
cooperative_ = cooperative;
|
||||
|
||||
kernarg_pool_base_ = nullptr;
|
||||
kernarg_pool_size_ = 0;
|
||||
@@ -690,7 +693,7 @@ VirtualGPU::~VirtualGPU() {
|
||||
}
|
||||
}
|
||||
|
||||
bool VirtualGPU::create(bool profilingEna) {
|
||||
bool VirtualGPU::create() {
|
||||
// Checking Virtual gpu unique index for ROCm backend
|
||||
if (index() > device().settings().commandQueues_) {
|
||||
return false;
|
||||
@@ -698,10 +701,10 @@ bool VirtualGPU::create(bool profilingEna) {
|
||||
|
||||
// Pick a reasonable queue size
|
||||
uint32_t queue_size = 1024;
|
||||
gpu_queue_ = roc_device_.acquireQueue(queue_size);
|
||||
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_);
|
||||
if (!gpu_queue_) return false;
|
||||
|
||||
if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
|
||||
if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
|
||||
LogError("Couldn't allocate arguments/signals for the queue");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -163,10 +163,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
|
||||
};
|
||||
|
||||
VirtualGPU(Device& device);
|
||||
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false);
|
||||
~VirtualGPU();
|
||||
|
||||
bool create(bool profilingEna);
|
||||
bool create();
|
||||
bool terminate() { return true; }
|
||||
const Device& dev() const { return roc_device_; }
|
||||
|
||||
@@ -311,13 +311,17 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
std::vector<Memory*> xferWriteBuffers_; //!< Stage write buffers
|
||||
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
||||
|
||||
/**
|
||||
* @brief Indicates if a kernel dispatch is outstanding. This flag is
|
||||
* used to synchronize on kernel outputs.
|
||||
*/
|
||||
bool hasPendingDispatch_;
|
||||
//! Queue state flags
|
||||
union {
|
||||
struct {
|
||||
uint32_t hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding
|
||||
uint32_t imageBufferWrtBack_ : 1; //!< Image buffer write back is required
|
||||
uint32_t profiling_ : 1; //!< Profiling is enabled
|
||||
uint32_t cooperative_ : 1; //!< Cooperative launch is enabled
|
||||
};
|
||||
uint32_t state_;
|
||||
};
|
||||
|
||||
bool imageBufferWrtBack_; //!< Enable image buffer write back
|
||||
std::vector<device::Memory*> wrtBackImageBuffer_; //!< Array of images for write back
|
||||
|
||||
Timestamp* timestamp_;
|
||||
|
||||
Reference in New Issue
Block a user