From 14aa72890ecee449bb3bd8cdc832b5acfb87a37c Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Fri, 27 Mar 2020 15:43:06 -0400 Subject: [PATCH] SWDEV-184709 - support hipLaunchCooperativeKernel() Add ROCr cooperative queue allocation Change-Id: I1384482692f4080d31255b09e0f68a21ccad3da8 [ROCm/clr commit: 7ef8dfdfe793fe9298dc9606bac680eb566e444d] --- projects/clr/rocclr/device/rocm/rocdevice.cpp | 67 ++++++++++++------- projects/clr/rocclr/device/rocm/rocdevice.hpp | 2 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 11 +-- .../clr/rocclr/device/rocm/rocvirtual.hpp | 20 +++--- 4 files changed, 63 insertions(+), 37 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 6ab4be5070..edddf47316 100755 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -1457,14 +1457,20 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { amd::ScopedLock lock(vgpusAccess()); bool profiling = (queue != nullptr) && queue->properties().test(CL_QUEUE_PROFILING_ENABLE); + bool cooperative = false; - profiling |= (queue == nullptr) ? true : false; + // If amd command queue is null, then it's an internal device queue + if (queue == nullptr) { + // In HIP mode the device queue will be allocated for the cooperative launches only + cooperative = amd::IS_HIP; + profiling = amd::IS_HIP; + } // Initialization of heap and other resources occur during the command // queue creation time. - VirtualGPU* virtualDevice = new VirtualGPU(*this); + VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative); - if (!virtualDevice->create(profiling)) { + if (!virtualDevice->create()) { delete virtualDevice; return nullptr; } @@ -1874,12 +1880,13 @@ VirtualGPU* Device::xferQueue() const { return xferQueue_; } -bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { +bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, + cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { bool result = true; return result; } -hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) { +hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue) { assert(queuePool_.size() <= GPU_MAX_HW_QUEUES); ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d", queuePool_.size(), GPU_MAX_HW_QUEUES); @@ -1907,8 +1914,15 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) { } auto queue_size = (queue_max_packets < queue_size_hint) ? queue_max_packets : queue_size_hint; - hsa_queue_t *queue; - while (hsa_queue_create(_bkendDevice, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr, + hsa_queue_t* queue; + auto queue_type = HSA_QUEUE_TYPE_MULTI; + + // Enable cooperative queue for the device queue + if (coop_queue) { + queue_type = HSA_QUEUE_TYPE_COOPERATIVE; + } + + while (hsa_queue_create(_bkendDevice, queue_size, queue_type, nullptr, nullptr, std::numeric_limits::max(), std::numeric_limits::max(), &queue) != HSA_STATUS_SUCCESS) { queue_size >>= 1; @@ -1919,6 +1933,11 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) { ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d", queue, queue_size); hsa_amd_profiling_set_profiler_enabled(queue, 1); + if (coop_queue) { + // Skip queue recycling for cooperative queues, since it should be just one + // per device. + return queue; + } auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo())); assert(result.second && "QueueInfo already exists"); auto &qInfo = result.first->second; @@ -1928,26 +1947,26 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) { void Device::releaseQueue(hsa_queue_t* queue) { auto qIter = queuePool_.find(queue); - assert(qIter != queuePool_.end()); + if (qIter != queuePool_.end()) { + auto &qInfo = qIter->second; + assert(qInfo.refCount > 0); + qInfo.refCount--; + if (qInfo.refCount != 0) { + return; + } + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); - auto &qInfo = qIter->second; - assert(qInfo.refCount > 0); - qInfo.refCount--; - if (qInfo.refCount != 0) { - return; + if (qInfo.hostcallBuffer_) { + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p", + qInfo.hostcallBuffer_, queue); + disableHostcalls(qInfo.hostcallBuffer_, queue); + context().svmFree(qInfo.hostcallBuffer_); + } + + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); + queuePool_.erase(qIter); } - ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); - - if (qInfo.hostcallBuffer_) { - ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p", - qInfo.hostcallBuffer_, queue); - disableHostcalls(qInfo.hostcallBuffer_, queue); - context().svmFree(qInfo.hostcallBuffer_); - } - - ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); hsa_queue_destroy(queue); - queuePool_.erase(qIter); } void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) { diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index aadd1cb463..24c8b71de1 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -432,7 +432,7 @@ class Device : public NullDevice { //! Acquire HSA queue. This method can create a new HSA queue or //! share previously created - hsa_queue_t* acquireQueue(uint32_t queue_size_hint); + hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false); //! Release HSA queue void releaseQueue(hsa_queue_t*); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 2a18f97d8e..68b563dce6 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -584,8 +584,9 @@ bool VirtualGPU::releaseGpuMemoryFence() { return true; } -VirtualGPU::VirtualGPU(Device& device) +VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative) : device::VirtualDevice(device), + state_(0), gpu_queue_(nullptr), roc_device_(device), virtualQueue_(nullptr), @@ -603,6 +604,8 @@ VirtualGPU::VirtualGPU(Device& device) // Initialize the last signal and dispatch flags timestamp_ = nullptr; hasPendingDispatch_ = false; + profiling_ = profiling; + cooperative_ = cooperative; kernarg_pool_base_ = nullptr; kernarg_pool_size_ = 0; @@ -690,7 +693,7 @@ VirtualGPU::~VirtualGPU() { } } -bool VirtualGPU::create(bool profilingEna) { +bool VirtualGPU::create() { // Checking Virtual gpu unique index for ROCm backend if (index() > device().settings().commandQueues_) { return false; @@ -698,10 +701,10 @@ bool VirtualGPU::create(bool profilingEna) { // Pick a reasonable queue size uint32_t queue_size = 1024; - gpu_queue_ = roc_device_.acquireQueue(queue_size); + gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_); if (!gpu_queue_) return false; - if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) { + if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) { LogError("Couldn't allocate arguments/signals for the queue"); return false; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index cfadcc397d..811d445a97 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -163,10 +163,10 @@ class VirtualGPU : public device::VirtualDevice { size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue }; - VirtualGPU(Device& device); + VirtualGPU(Device& device, bool profiling = false, bool cooperative = false); ~VirtualGPU(); - bool create(bool profilingEna); + bool create(); bool terminate() { return true; } const Device& dev() const { return roc_device_; } @@ -311,13 +311,17 @@ class VirtualGPU : public device::VirtualDevice { std::vector xferWriteBuffers_; //!< Stage write buffers std::vector pinnedMems_; //!< Pinned memory list - /** - * @brief Indicates if a kernel dispatch is outstanding. This flag is - * used to synchronized on kernel outputs. - */ - bool hasPendingDispatch_; + //! Queue state flags + union { + struct { + uint32_t hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding + uint32_t imageBufferWrtBack_ : 1; //!< Image buffer write back is required + uint32_t profiling_ : 1; //!< Profiling is enabled + uint32_t cooperative_ : 1; //!< Cooperative launch is enabled + }; + uint32_t state_; + }; - bool imageBufferWrtBack_; //!< Enable image buffer write back std::vector wrtBackImageBuffer_; //!< Array of images for write back Timestamp* timestamp_;