From 9d158024305dc9941562b4629584c0558fba79c1 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 3 May 2016 14:32:32 -0400
Subject: [PATCH] P4 to Git Change 1264675 by gandryey@gera-w8 on 2016/05/03
14:13:52
SWDEV-86170 - Need OCL changes for Compute Unit Reservation
- Add support for RT and Medium priority queues
- Use the new packet for the CU mask programming. It will allow CU reservation for RT queue in KMD.
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_command.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/schedule.cl#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#546 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#159 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#402 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#139 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#81 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp#165 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/backend.h#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.cpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.hpp#17 edit
---
rocclr/runtime/device/gpu/gpudevice.cpp | 32 +++++++-----
rocclr/runtime/device/gpu/gpudevice.hpp | 13 ++++-
rocclr/runtime/device/gpu/gpumemory.cpp | 4 ++
rocclr/runtime/device/gpu/gpuvirtual.cpp | 51 ++++++++++++-------
rocclr/runtime/device/gpu/gpuvirtual.hpp | 11 ++--
.../device/gpu/gslbe/src/rt/GSLContext.cpp | 7 +--
.../device/gpu/gslbe/src/rt/GSLContext.h | 2 +-
.../device/gpu/gslbe/src/rt/GSLDevice.cpp | 1 +
.../runtime/device/gpu/gslbe/src/rt/backend.h | 1 +
rocclr/runtime/platform/commandqueue.cpp | 5 +-
rocclr/runtime/platform/commandqueue.hpp | 17 ++++++-
11 files changed, 98 insertions(+), 46 deletions(-)
diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 88534a51f3..e2f7d137fd 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -174,7 +174,7 @@ NullDevice::create(CALtarget target)
calAttr.localRAM = 512;
// Fill the device info structure
- fillDeviceInfo(calAttr, memInfo, 4096, 1);
+ fillDeviceInfo(calAttr, memInfo, 4096, 1, 0);
if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
// Runtime doesn't know what local size could be on the real board
@@ -280,11 +280,14 @@ NullDevice::createProgram(amd::option::Options* options)
return new NullProgram(*this);
}
-void NullDevice::fillDeviceInfo(
+void
+NullDevice::fillDeviceInfo(
const CALdeviceattribs& calAttr,
const gslMemInfo& memInfo,
size_t maxTextureSize,
- uint numComputeRings)
+ uint numComputeRings,
+ uint numComputeRingsRT
+ )
{
info_.type_ = CL_DEVICE_TYPE_GPU;
info_.vendorId_ = 0x1002;
@@ -549,8 +552,8 @@ void NullDevice::fillDeviceInfo(
info_.localMemBanks_ = hwInfo()->localMemBanks_;
info_.gfxipVersion_ = hwInfo()->gfxipVersion_;
info_.numAsyncQueues_ = numComputeRings;
- info_.numRTQueues_ = 2;
- info_.numRTCUs_ = 4;
+ info_.numRTQueues_ = numComputeRingsRT;
+ info_.numRTCUs_ = calAttr.maxRTCUs;
info_.threadTraceEnable_ = settings().threadTraceEnable_;
}
}
@@ -576,6 +579,7 @@ void
Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings)
{
numComputeRings_ = 0;
+ numComputeRingsRT_ = 0;
numDmaEngines_ = 0;
for (uint i = 0; i < num; ++i) {
@@ -587,6 +591,13 @@ Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeR
numComputeRings_++;
}
+ if (desc[i].id == GSL_ENGINEID_COMPUTE_RT) {
+ numComputeRingsRT_++;
+ }
+ if (desc[i].id == GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY) {
+ numComputeRingsRT_++;
+ }
+
if (desc[i].id >= GSL_ENGINEID_DRMDMA0 &&
desc[i].id <= GSL_ENGINEID_DRMDMA1) {
numDmaEngines_++;
@@ -910,7 +921,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
// Fill the device info structure
fillDeviceInfo(getAttribs(), getMemInfo(),
static_cast<size_t>(getMaxTextureSize()),
- engines().numComputeRings());
+ engines().numComputeRings(), engines().numComputeRingsRT());
if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
if (NULL == hsaCompiler_) {
@@ -969,7 +980,7 @@ Device::initializeHeapResources()
PerformFullInitialization();
- uint numComputeRings = engines_.numComputeRings();
+ uint numComputeRings = engines_.numComputeRings() + engines_.numComputeRingsRT();
scratch_.resize((settings().useSingleScratch_) ? 1 : (numComputeRings ? numComputeRings : 1));
// Initialize the number of mem object for the scratch buffer
@@ -1074,7 +1085,7 @@ Device::createVirtualDevice(
{
bool profiling = false;
bool interopQueue = false;
- uint rtCUs = 0;
+ uint rtCUs = amd::CommandQueue::RealTimeDisabled;
uint deviceQueueSize = 0;
if (queue != NULL) {
@@ -1101,10 +1112,7 @@ Device::createVirtualDevice(
}
VirtualGPU* vgpu = new VirtualGPU(*this);
- if (vgpu && vgpu->create(
- profiling
- , deviceQueueSize
- )) {
+ if (vgpu && vgpu->create(profiling, rtCUs, deviceQueueSize, (queue != NULL) ? queue->priority() : amd::CommandQueue::Priority::Normal)) { // queue may be NULL (see the NULL check above), so fall back to Normal priority
return vgpu;
} else {
delete vgpu;
diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp
index 55328d48ae..9c4d4027ea 100644
--- a/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -129,7 +129,8 @@ protected:
const CALdeviceattribs& calAttr, //!< CAL device attributes info
const gslMemInfo& memInfo, //!< GSL mem info
size_t maxTextureSize, //!< Maximum texture size supported in HW
- uint numComputeRings //!< Number of compute rings
+ uint numComputeRings, //!< Number of compute rings
+ uint numComputeRingsRT //!< Number of RT compute rings
);
};
@@ -237,7 +238,11 @@ public:
{
public:
//! Default constructor
- Engines() { memset(desc_, 0xff, sizeof(desc_)); }
+ Engines()
+ : numComputeRings_(0)
+ , numComputeRingsRT_(0)
+ , numDmaEngines_(0)
+ { memset(desc_, 0xff, sizeof(desc_)); }
//! Creates engine descriptor for this class
void create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings);
@@ -251,11 +256,15 @@ public:
//! Returns the number of available compute rings
uint numComputeRings() const { return numComputeRings_; }
+ //! Returns the number of available real time compute rings
+ uint numComputeRingsRT() const { return numComputeRingsRT_; }
+
//! Returns the number of available DMA engines
uint numDMAEngines() const { return numDmaEngines_; }
private:
uint numComputeRings_;
+ uint numComputeRingsRT_;
uint numDmaEngines_;
gslEngineDescriptor desc_[GSL_ENGINEID_MAX]; //!< Engine descriptor
};
diff --git a/rocclr/runtime/device/gpu/gpumemory.cpp b/rocclr/runtime/device/gpu/gpumemory.cpp
index 807b26d6d4..6fd4bfd7b3 100644
--- a/rocclr/runtime/device/gpu/gpumemory.cpp
+++ b/rocclr/runtime/device/gpu/gpumemory.cpp
@@ -171,6 +171,7 @@ Memory::create(
// Check if CAL created a resource
if (result) {
switch (memoryType()) {
+ case Resource::Persistent:
case Resource::Pinned:
case Resource::ExternalPhysical:
// Marks memory object for direct GPU access to the host memory
@@ -186,6 +187,9 @@ Memory::create(
case Resource::View: {
Resource::ViewParams* view =
reinterpret_cast<Resource::ViewParams*>(params);
+ if (view->resource_->memoryType() == Resource::Persistent) {
+ flags_ |= HostMemoryDirectAccess;
+ }
// Check if parent was allocated in system memory
if ((view->resource_->memoryType() == Resource::Pinned) ||
(((view->resource_->memoryType() == Resource::Remote) ||
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index ef9773d503..2ec9386fae 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -196,7 +196,7 @@ VirtualGPU::DmaFlushMgmt::isCbReady(
}
bool
-VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines)
+VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs)
{
// GSL device initialization
dev().PerformFullInitialization();
@@ -206,7 +206,7 @@ VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines)
? CAL_WAIT_LOW_CPU_UTILIZATION
: CAL_WAIT_POLLING;
- if (!open(&dev(), nEngines, engines)) {
+ if (!open(&dev(), nEngines, engines, rtCUs)) {
return false;
}
@@ -432,10 +432,8 @@ VirtualGPU::VirtualGPU(
}
bool
-VirtualGPU::create(
- bool profiling
- , uint deviceQueueSize
- )
+VirtualGPU::create(bool profiling, uint rtCUs, uint deviceQueueSize,
+ amd::CommandQueue::Priority priority)
{
device::BlitManager::Setup blitSetup;
gslEngineDescriptor engines[2];
@@ -452,14 +450,34 @@ VirtualGPU::create(
{
if (dev().engines().numComputeRings()) {
- uint idx = index() % dev().engines().numComputeRings();
+ uint idx;
+ if ((amd::CommandQueue::RealTimeDisabled == rtCUs) &&
+ (priority == amd::CommandQueue::Priority::Normal)) {
+ idx = index() % dev().engines().numComputeRings();
+ engineMask = dev().engines().getMask(
+ (gslEngineID)(dev().isComputeRingIDForced() ?
+ dev().getforcedComputeEngineID() :
+ (dev().getFirstAvailableComputeEngineID() + idx)));
+
+ }
+ else {
+ if (priority == amd::CommandQueue::Priority::Medium) {
+ engineMask = dev().engines().getMask((gslEngineID)
+ (GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY));
+ }
+ else {
+ engineMask = dev().engines().getMask((gslEngineID)
+ (GSL_ENGINEID_COMPUTE_RT));
+ }
+ //!@todo This is not a generic solution and
+ // may have issues with > 8 queues
+ idx = index() % (dev().engines().numComputeRings() +
+ dev().engines().numComputeRingsRT());
+ }
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
- engineMask = dev().engines().getMask((gslEngineID)(dev().isComputeRingIDForced() ?
- dev().getforcedComputeEngineID() :
- (dev().getFirstAvailableComputeEngineID() + idx)));
if (dev().canDMA()) {
// If only 1 DMA engine is available then use that one
if (dev().engines().numDMAEngines() < 2) {
@@ -479,12 +497,12 @@ VirtualGPU::create(
engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0);
}
}
- num = dev().engines().getRequested(engineMask, engines);
+ }
+ num = dev().engines().getRequested(engineMask, engines);
- // Open GSL context
- if ((num == 0) || !gslOpen(num, engines)) {
- return false;
- }
+ // Open GSL context
+ if ((num == 0) || !gslOpen(num, engines, rtCUs)) {
+ return false;
}
// Diable double copy optimization,
@@ -1178,7 +1196,6 @@ VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
-
gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory());
amd::Memory* owner = memory->owner();
bool unmapMip = false;
@@ -2831,7 +2848,6 @@ VirtualGPU::flushDMA(uint engineID)
//! since only L2 cache is flushed in KMD frame,
//! but L1 still has to be invalidated.
}
-
//! \note Use CtxIsEventDone, so we won't flush compute for DRM engine
isDone(&cal_.events_[engineID]);
}
@@ -2841,7 +2857,6 @@ VirtualGPU::waitAllEngines(CommandBatch* cb)
{
uint i;
GpuEvent* events; //!< GPU events for the batch
-
// If command batch is NULL then wait for the current
if (NULL == cb) {
events = cal_.events_;
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.hpp b/rocclr/runtime/device/gpu/gpuvirtual.hpp
index c8dc7a9bc0..e3cce21d2f 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -10,6 +10,7 @@
#include "device/gpu/gpuprintf.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpusched.hpp"
+#include "platform/commandqueue.hpp"
#include "device/blit.hpp"
#include "device/gpu/gpudebugger.hpp"
@@ -199,12 +200,10 @@ public:
typedef std::vector ResourceSlots;
public:
-
VirtualGPU(Device& device);
- bool create(
- bool profiling
- , uint deviceQueueSize = 0
- );
+ bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled,
+ uint deviceQueueSize = 0,
+ amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
~VirtualGPU();
void submitReadMemory(amd::ReadMemoryCommand& vcmd);
@@ -443,7 +442,7 @@ private:
//! Frees CAL kernel descriptor of the virtual device
void freeKernelDesc(GslKernelDesc* desc);
- bool gslOpen(uint nEngines, gslEngineDescriptor *engines);
+ bool gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs);
void gslDestroy();
//! Releases stage write buffers
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index 7e52e6c493..41fe0c427c 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -46,7 +46,8 @@ bool
CALGSLContext::open(
const CALGSLDevice* pDeviceObject,
uint32 nEngines,
- gslEngineDescriptor* engines)
+ gslEngineDescriptor* engines,
+ uint32 rtCUs)
{
m_Dev = pDeviceObject;
@@ -63,7 +64,7 @@ CALGSLContext::open(
for (uint i = 0; i < nEngines; i++)
{
if (engines[i].id >= GSL_ENGINEID_3DCOMPUTE0 &&
- engines[i].id <= GSL_ENGINEID_COMPUTE7)
+ engines[i].id <= GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY)
{
mainEngineOrdinal = engines[i].id;
}
@@ -76,7 +77,7 @@ CALGSLContext::open(
}
}
- m_cs = native->createComputeContext(mainEngineOrdinal, sdmaOrdinal, false);
+ m_cs = native->createComputeContext(mainEngineOrdinal, sdmaOrdinal, false, rtCUs);
if (m_cs == 0)
{
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index 61b77c9435..06cde42b6d 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -29,7 +29,7 @@ public:
CALGSLContext();
~CALGSLContext();
- bool open(const CALGSLDevice* pDeviceObject, uint32 nEngines, gslEngineDescriptor *engines);
+ bool open(const CALGSLDevice* pDeviceObject, uint32 nEngines, gslEngineDescriptor *engines, uint32 rtCUs = 0);
void close(gsl::gsAdaptor* native);
bool setInput(uint32 physUnit, gslMemObject mem);
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
index 473c96d04d..05b858c1ca 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
@@ -119,6 +119,7 @@ CALGSLDevice::getAttribs_int(gsl::gsCtx* cs)
m_attribs.isOpenCL200Device = m_adp->pAsicInfo->bIsOpen2Device;
m_attribs.isSVMFineGrainSystem = m_adp->pAsicInfo->svmFineGrainSystem;
m_attribs.isWDDM2Enabled = m_adp->pAsicInfo->vaAvailable && m_adp->pAsicInfo->bNoVATranslation;
+ m_attribs.maxRTCUs = cs->getMaxRTCUs();
}
bool
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h b/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h
index 0308691384..f94332bcf8 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h
@@ -84,6 +84,7 @@ typedef struct CALdeviceattribsRec {
bool isOpenCL200Device; /**< the flag to mark if the device is OpenCL 200 */
bool isSVMFineGrainSystem; /**< check if SVM finegrainsystem */
bool isWDDM2Enabled; /**< check if WDDM2 is enabled */
+ CALuint maxRTCUs; /**< The maximum number of RT CUs for RT queues */
} CALdeviceattribs;
diff --git a/rocclr/runtime/platform/commandqueue.cpp b/rocclr/runtime/platform/commandqueue.cpp
index 1f3071440d..a954685ac2 100644
--- a/rocclr/runtime/platform/commandqueue.cpp
+++ b/rocclr/runtime/platform/commandqueue.cpp
@@ -18,10 +18,11 @@
namespace amd {
HostQueue::HostQueue(
- Context& context, Device& device, cl_command_queue_properties properties, uint queueRTCUs
+ Context& context, Device& device,
+ cl_command_queue_properties properties, uint queueRTCUs, Priority priority
)
: CommandQueue(context, device, properties, device.info().queueProperties_
- | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, queueRTCUs)
+ | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, queueRTCUs, priority)
{
if (thread_.state() >= Thread::INITIALIZED) {
ScopedLock sl(queueLock_);
diff --git a/rocclr/runtime/platform/commandqueue.hpp b/rocclr/runtime/platform/commandqueue.hpp
index 05ad3b42d3..3bea84fc45 100644
--- a/rocclr/runtime/platform/commandqueue.hpp
+++ b/rocclr/runtime/platform/commandqueue.hpp
@@ -31,6 +31,12 @@ class DeviceQueue;
class CommandQueue : public RuntimeObject
{
public:
+ static const uint RealTimeDisabled = 0xffffffff;
+ enum class Priority : uint {
+ Normal = 0,
+ Medium
+ };
+
struct Properties
{
typedef cl_command_queue_properties value_type;
@@ -92,6 +98,9 @@ public:
//! Returns the number or requested real time CUs
uint rtCUs() const { return rtCUs_; }
+ //! Returns the queue priority
+ Priority priority() const { return priority_; }
+
protected:
//! CommandQueue constructor is protected
//! to keep the CommandQueue class as a virtual interface
@@ -100,16 +109,19 @@ protected:
Device& device, //!< Device object
cl_command_queue_properties properties, //!< Queue properties
cl_command_queue_properties propMask, //!< Queue properties mask
- uint rtCUs = 0 //!< Avaialble real time compute units
+ uint rtCUs = RealTimeDisabled, //!< Available real time compute units
+ Priority priority = Priority::Normal //!< Queue priority
)
: properties_(propMask, properties)
, rtCUs_(rtCUs)
+ , priority_(priority)
, queueLock_("CommandQueue::queueLock")
, device_(device)
, context_(context) {}
Properties properties_; //!< Queue properties
uint rtCUs_; //!< The number of used RT compute units
+ Priority priority_; //!< Queue priority
Monitor queueLock_; //!< Lock protecting the queue
Device& device_; //!< The device
SharedReference context_; //!< The context of this command queue
@@ -179,7 +191,8 @@ public:
Context& context,
Device& device,
cl_command_queue_properties properties,
- uint queueRTCUs = 0
+ uint queueRTCUs = 0,
+ Priority priority = Priority::Normal
);
//! Returns TRUE if this command queue can accept commands.