SWDEV-184709 - support hipLaunchCooperativeKernel()

Add ROCr cooperative queue allocation

Change-Id: I1384482692f4080d31255b09e0f68a21ccad3da8


[ROCm/clr commit: 7ef8dfdfe7]
This commit is contained in:
German Andryeyev
2020-03-27 15:43:06 -04:00
committed by Aakash Sudhanwa
parent 782a76511b
commit 14aa72890e
4 changed files with 63 additions and 37 deletions
+43 -24
View File
@@ -1457,14 +1457,20 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
amd::ScopedLock lock(vgpusAccess());
bool profiling = (queue != nullptr) && queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
bool cooperative = false;
profiling |= (queue == nullptr) ? true : false;
// If amd command queue is null, then it's an internal device queue
if (queue == nullptr) {
// In HIP mode the device queue will be allocated for the cooperative launches only
cooperative = amd::IS_HIP;
profiling = amd::IS_HIP;
}
// Initialization of the heap and other resources occurs at command
// queue creation time.
VirtualGPU* virtualDevice = new VirtualGPU(*this);
VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative);
if (!virtualDevice->create(profiling)) {
if (!virtualDevice->create()) {
delete virtualDevice;
return nullptr;
}
@@ -1874,12 +1880,13 @@ VirtualGPU* Device::xferQueue() const {
return xferQueue_;
}
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
bool result = true;
return result;
}
hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue) {
assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
queuePool_.size(), GPU_MAX_HW_QUEUES);
@@ -1907,8 +1914,15 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
}
auto queue_size = (queue_max_packets < queue_size_hint) ? queue_max_packets : queue_size_hint;
hsa_queue_t *queue;
while (hsa_queue_create(_bkendDevice, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr,
hsa_queue_t* queue;
auto queue_type = HSA_QUEUE_TYPE_MULTI;
// Enable cooperative queue for the device queue
if (coop_queue) {
queue_type = HSA_QUEUE_TYPE_COOPERATIVE;
}
while (hsa_queue_create(_bkendDevice, queue_size, queue_type, nullptr, nullptr,
std::numeric_limits<uint>::max(), std::numeric_limits<uint>::max(),
&queue) != HSA_STATUS_SUCCESS) {
queue_size >>= 1;
@@ -1919,6 +1933,11 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d",
queue, queue_size);
hsa_amd_profiling_set_profiler_enabled(queue, 1);
if (coop_queue) {
// Skip queue recycling for cooperative queues, since there should be just one
// per device.
return queue;
}
auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
assert(result.second && "QueueInfo already exists");
auto &qInfo = result.first->second;
@@ -1928,26 +1947,26 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
void Device::releaseQueue(hsa_queue_t* queue) {
auto qIter = queuePool_.find(queue);
assert(qIter != queuePool_.end());
if (qIter != queuePool_.end()) {
auto &qInfo = qIter->second;
assert(qInfo.refCount > 0);
qInfo.refCount--;
if (qInfo.refCount != 0) {
return;
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
auto &qInfo = qIter->second;
assert(qInfo.refCount > 0);
qInfo.refCount--;
if (qInfo.refCount != 0) {
return;
if (qInfo.hostcallBuffer_) {
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
qInfo.hostcallBuffer_, queue);
disableHostcalls(qInfo.hostcallBuffer_, queue);
context().svmFree(qInfo.hostcallBuffer_);
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
queuePool_.erase(qIter);
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
if (qInfo.hostcallBuffer_) {
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
qInfo.hostcallBuffer_, queue);
disableHostcalls(qInfo.hostcallBuffer_, queue);
context().svmFree(qInfo.hostcallBuffer_);
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
hsa_queue_destroy(queue);
queuePool_.erase(qIter);
}
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
@@ -432,7 +432,7 @@ class Device : public NullDevice {
//! Acquire an HSA queue. This method can either create a new HSA queue or
//! share a previously created one.
hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false);
//! Release HSA queue
void releaseQueue(hsa_queue_t*);
@@ -584,8 +584,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
return true;
}
VirtualGPU::VirtualGPU(Device& device)
VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative)
: device::VirtualDevice(device),
state_(0),
gpu_queue_(nullptr),
roc_device_(device),
virtualQueue_(nullptr),
@@ -603,6 +604,8 @@ VirtualGPU::VirtualGPU(Device& device)
// Initialize the last signal and dispatch flags
timestamp_ = nullptr;
hasPendingDispatch_ = false;
profiling_ = profiling;
cooperative_ = cooperative;
kernarg_pool_base_ = nullptr;
kernarg_pool_size_ = 0;
@@ -690,7 +693,7 @@ VirtualGPU::~VirtualGPU() {
}
}
bool VirtualGPU::create(bool profilingEna) {
bool VirtualGPU::create() {
// Checking Virtual gpu unique index for ROCm backend
if (index() > device().settings().commandQueues_) {
return false;
@@ -698,10 +701,10 @@ bool VirtualGPU::create(bool profilingEna) {
// Pick a reasonable queue size
uint32_t queue_size = 1024;
gpu_queue_ = roc_device_.acquireQueue(queue_size);
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_);
if (!gpu_queue_) return false;
if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
LogError("Couldn't allocate arguments/signals for the queue");
return false;
}
+12 -8
View File
@@ -163,10 +163,10 @@ class VirtualGPU : public device::VirtualDevice {
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
VirtualGPU(Device& device);
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false);
~VirtualGPU();
bool create(bool profilingEna);
bool create();
bool terminate() { return true; }
const Device& dev() const { return roc_device_; }
@@ -311,13 +311,17 @@ class VirtualGPU : public device::VirtualDevice {
std::vector<Memory*> xferWriteBuffers_; //!< Stage write buffers
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
/**
* @brief Indicates if a kernel dispatch is outstanding. This flag is
* used to synchronize on kernel outputs.
*/
bool hasPendingDispatch_;
//! Queue state flags
//! The anonymous struct declares four one-bit fields of type uint32_t, and the
//! enclosing anonymous union overlays them with the single word state_.
//! NOTE(review): the VirtualGPU constructor's initializer list sets state_(0),
//! presumably to clear every flag with one store - confirm that accessing the
//! flags through the other union member is acceptable for the supported
//! compilers.
union {
struct {
uint32_t hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding
uint32_t imageBufferWrtBack_ : 1; //!< Image buffer write back is required
uint32_t profiling_ : 1; //!< Profiling is enabled
uint32_t cooperative_ : 1; //!< Cooperative launch is enabled
};
uint32_t state_; //!< The flag bits above viewed as one 32-bit word
};
bool imageBufferWrtBack_; //!< Enable image buffer write back
std::vector<device::Memory*> wrtBackImageBuffer_; //!< Array of images for write back
Timestamp* timestamp_;