From f8dc564915875c6eff34acc204091040c7facc0d Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 25 Jul 2014 20:41:05 -0400
Subject: [PATCH] P4 to Git Change 1059564 by gandryey@gera-dev-w7 on
2014/07/25 18:14:33
ECR #304775 - Device enqueuing
- Run the scheduler in the host queue by default.
- GPU_USE_DEVICE_QUEUE=1 can force execution in the device queue
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#451 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#128 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#260 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#273 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#86 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#327 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#119 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#208 edit
[ROCm/clr commit: 3a4400135d4f1716beb799527806e06582b97a75]
---
.../rocclr/runtime/device/gpu/gpudevice.cpp | 1 -
.../rocclr/runtime/device/gpu/gpudevice.hpp | 1 -
.../rocclr/runtime/device/gpu/gpukernel.cpp | 12 ++-
.../rocclr/runtime/device/gpu/gpusched.hpp | 3 +
.../rocclr/runtime/device/gpu/gpuschedcl.cpp | 25 ++++--
.../rocclr/runtime/device/gpu/gpusettings.cpp | 3 +
.../rocclr/runtime/device/gpu/gpusettings.hpp | 3 +-
.../rocclr/runtime/device/gpu/gpuvirtual.cpp | 90 ++++++++++++-------
.../rocclr/runtime/device/gpu/gpuvirtual.hpp | 11 +--
.../device/gpu/gslbe/src/rt/GSLContext.cpp | 4 +-
.../device/gpu/gslbe/src/rt/GSLContext.h | 3 +-
projects/clr/rocclr/runtime/utils/flags.hpp | 2 +
12 files changed, 107 insertions(+), 51 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
index 025322fa8c..1d088139e2 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -359,7 +359,6 @@ Device::Device()
: NullDevice()
, CALGSLDevice()
, numOfVgpus_(0)
- , numDeviceQueues_(0)
, context_(NULL)
, heap_(NULL)
, dummyPage_(NULL)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
index 2a6cb24aaf..e7f384ecd5 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -468,7 +468,6 @@ public:
//! Returns the number of virtual GPUs allocated on this device
uint numOfVgpus() const { return numOfVgpus_; }
uint numOfVgpus_; //!< The number of virtual GPUs (lock protected)
- uint numDeviceQueues_; //!< Number of device queues
typedef std::vector VirtualGPUs;
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index 0a40baefed..142e3d7029 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3975,7 +3975,17 @@ HSAILKernel::loadArguments(
const amd::DeviceQueue* queue =
*reinterpret_cast(paramaddr);
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
- uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
+ uint64_t vmQueue;
+ if (dev().settings().useDeviceQueue_) {
+ vmQueue = gpuQueue->vQueue()->vmAddress();
+ }
+ else {
+ if (!gpu.createVirtualQueue(queue->size())) {
+ LogError( "Virtual queue creaiton failed!");
+ return false;
+ }
+ vmQueue = gpu.vQueue()->vmAddress();
+ }
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
break;
}
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp
index f0bcd2f4c0..bb456c3b60 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp
@@ -66,6 +66,9 @@ struct SchedulerParam {
uint64_t scratch; //!< GPU address to the scratch buffer
uint32_t numMaxWaves; //!< The max number of possible waves
uint32_t releaseHostCP; //!< Releases CP on the host queue
+ uint64_t parentAQL; //!< Host parent AmdAqlWrap packet
+ uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue
+ uint32_t reserved; //!< Reserved field
};
} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 6390025655..98be8f0c39 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -90,6 +90,9 @@ typedef struct _SchedulerParam {
ulong scratch; //!< GPU address to the scratch buffer
uint numMaxWaves; //!< Num max waves on the asic
uint releaseHostCP; //!< Releases CP on the host queue
+ ulong parentAQL; //!< Host parent AmdAqlWrap packet
+ uint dedicatedQueue; //!< Scheduler uses a dedicated queue
+ uint reserved; //!< Reserved field
} SchedulerParam;
typedef struct _HwDispatch {
@@ -276,7 +279,9 @@ static inline bool
checkWaitEvents(__global AmdEvent** events, uint numEvents)
{
for (uint i = 0; i < numEvents; ++i) {
- if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) {
+ if (atomic_load_explicit(
+ (__global atomic_uint*)(&events[i]->state),
+ memory_order_acquire, memory_scope_device) != CL_COMPLETE) {
return false;
}
}
@@ -348,6 +353,8 @@ scheduler(
__global SchedulerParam* param = &params[paramIdx];
volatile __global HwDispatch* hwDisp =
(volatile __global HwDispatch*)param->hw_queue;
+ __global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL);
+ __global uint* counter = (__global uint*)(&hostParent->child_counter);
__global uint* signal = (__global uint*)(&param->signal);
__global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
__global uint* amask = (__global uint *)queue->aql_slot_mask;
@@ -360,7 +367,7 @@ scheduler(
}
uint launch = 0;
- uint loop;
+ uint loop = 1;
do {
uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
@@ -491,10 +498,16 @@ scheduler(
launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
memory_order_acquire, memory_scope_device);
- loop = atomic_load_explicit((__global atomic_uint*)signal,
- memory_order_acquire, memory_scope_device);
+ if (param->dedicatedQueue) {
+ loop = atomic_load_explicit((__global atomic_uint*)signal,
+ memory_order_acquire, memory_scope_device);
+ }
+ else {
+ loop = atomic_load_explicit((__global atomic_uint*)counter,
+ memory_order_acquire, memory_scope_device);
+ }
- } while ((launch == 0) && (loop == 1));
+ } while ((launch == 0) && (loop != 0));
if (loop == 0) {
//! \todo Write deadcode to the template, but somehow
@@ -504,6 +517,8 @@ scheduler(
hwDisp[1].condExe1 = 0xdeadc0de;
hwDisp[1].condExe2 = 0xdeadc0de;
hwDisp[1].condExe3 = 0xdeadc0de;
+ atomic_store_explicit((__global atomic_uint*)signal,
+ 0, memory_order_release, memory_scope_device);
barrier(CLK_GLOBAL_MEM_FENCE);
atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
ResumeExecution, memory_order_release, memory_scope_device);
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
index 580976e51a..363ba49192 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -137,6 +137,9 @@ Settings::Settings()
// Use direct SRD by default
hsailDirectSRD_ = GPU_DIRECT_SRD;
+
+ // Use host queue for device enqueuing by default
+ useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
}
bool
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp
index 5835b6ca39..9ea5f6b1c6 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp
@@ -73,7 +73,8 @@ public:
uint apuSystem_: 1; //!< Device is APU system with shared memory
uint asyncMemCopy_: 1; //!< Use async memory transfers
uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
- uint reserved_: 2;
+ uint useDeviceQueue_: 1; //!< Submit to separate device queue
+ uint reserved_: 1;
};
uint value_;
};
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 865c216ea4..2664676f5e 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -264,6 +264,24 @@ VirtualGPU::releasePinnedMem()
bool
VirtualGPU::createVirtualQueue(uint deviceQueueSize)
{
+ if (deviceQueueSize_ == deviceQueueSize) {
+ return true;
+ }
+ else {
+ //! @todo Temporarily keep the buffer mapped for debug purpose
+ if (NULL != schedParams_) {
+ schedParams_->unmap(this);
+ }
+ delete vqHeader_;
+ delete virtualQueue_;
+ delete schedParams_;
+ vqHeader_ = NULL;
+ virtualQueue_ = NULL;
+ schedParams_ = NULL;
+ schedParamIdx_ = 0;
+ deviceQueueSize_ = 0;
+ }
+
uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
uint allocSize = deviceQueueSize;
@@ -339,6 +357,8 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
ptr = reinterpret_cast(schedParams_->map(this));
+ deviceQueueSize_ = deviceQueueSize;
+
return true;
}
@@ -365,6 +385,7 @@ VirtualGPU::VirtualGPU(
, virtualQueue_(NULL)
, schedParams_(NULL)
, schedParamIdx_(0)
+ , deviceQueueSize_(0)
, hsaQueueMem_(NULL)
{
memset(&cal_, 0, sizeof(CalVirtualDesc));
@@ -453,23 +474,7 @@ VirtualGPU::create(
#endif // !cl_amd_open_video
{
if (dev().engines().numComputeRings()) {
- uint idx;
-
- //! @todo Temporary workaround for Linux, because 2 HW queues only
- //! Fixes conformance failures with multi queues
- if ((0 == deviceQueueSize) || IS_WINDOWS) {
- //! @note: Add 1 to account the device queue for transfers
- idx = (index() + 1) % (dev().engines().numComputeRings() -
- gpuDevice_.numDeviceQueues_);
- }
- else {
- gpuDevice_.numDeviceQueues_++;
- if (gpuDevice_.numDeviceQueues_ >= dev().engines().numComputeRings()) {
- return false;
- }
- idx = (dev().engines().numComputeRings() - gpuDevice_.numDeviceQueues_)
- % dev().engines().numComputeRings();
- }
+ uint idx = index() % dev().engines().numComputeRings();
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
@@ -558,10 +563,9 @@ VirtualGPU::create(
return false;
}
- //! @todo for testing only
- //deviceQueueSize = (deviceQueueSize == 0) ? (128 * Ki) : deviceQueueSize;
// Check if the app requested a device queue creation
- if ((0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
+ if (dev().settings().useDeviceQueue_ &&
+ (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
LogError("Could not create a virtual queue!");
return false;
}
@@ -598,10 +602,6 @@ VirtualGPU::~VirtualGPU()
amd::ScopedLock k(dev().lockAsyncOps());
amd::ScopedLock lock(dev().vgpusAccess());
- if ((NULL != virtualQueue_) && IS_LINUX) {
- gpuDevice_.numDeviceQueues_--;
- }
-
uint i;
// Destroy all kernels
for (GslKernels::const_iterator it = gslKernels_.begin();
@@ -1696,13 +1696,19 @@ VirtualGPU::submitKernelInternalHSA(
return false;
}
else {
- gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
+ if (dev().settings().useDeviceQueue_) {
+ gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
+ if (gpuDefQueue->hwRing() == hwRing()) {
+ LogError("Can't submit the child kernels to the same HW ring as the host queue!");
+ return false;
+ }
+ }
+ else {
+ createVirtualQueue(defQueue->size());
+ gpuDefQueue = this;
+ }
}
vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
- if (gpuDefQueue->hwRing() == hwRing()) {
- LogError("Can't submit the child kernels to the same HW ring as the host queue!");
- return false;
- }
// Add memory handles before the actual dispatch
memList.push_back(gpuDefQueue->virtualQueue_);
@@ -1830,6 +1836,14 @@ VirtualGPU::submitKernelInternalHSA(
}
}
+ if (!dev().settings().useDeviceQueue_) {
+ // Add the termination handshake to the host queue
+ virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+ vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+ vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+ 0, dev().settings().useDeviceQueue_);
+ }
+
// Get the global loop start before the scheduler
mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
static_cast(gpuDefQueue->blitMgr()).runScheduler(
@@ -1842,6 +1856,7 @@ VirtualGPU::submitKernelInternalHSA(
// Get the address of PM4 template and add write it to params
//! @note DMA flush must not occur between patch and the scheduler
mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
+
// Program parameters for the scheduler
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
@@ -1852,6 +1867,9 @@ VirtualGPU::submitKernelInternalHSA(
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
param->launch = 0;
param->releaseHostCP = 0;
+ param->parentAQL = vmParentWrap;
+ param->dedicatedQueue = dev().settings().useDeviceQueue_;
+
// Fill the scratch buffer information
if (hsaKernel.prog().maxScratchRegs() > 0) {
gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
@@ -1880,16 +1898,20 @@ VirtualGPU::submitKernelInternalHSA(
gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
signalAddr, loopStart);
+
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
}
- // Add the termination handshake to the host queue
- virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
- vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
- vmParentWrap + offsetof(AmdAqlWrap, child_counter),
- signalAddr);
+ if (dev().settings().useDeviceQueue_) {
+ // Add the termination handshake to the host queue
+ virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+ vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+ vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+ signalAddr, dev().settings().useDeviceQueue_);
+ }
+
++gpuDefQueue->schedParamIdx_ %=
gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
//! \todo optimize the wrap around
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
index 79af320c25..93b09a5a0f 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -387,6 +387,11 @@ public:
//! Update virtual queue header
void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
+ //! Returns TRUE if virtual queue was successfully allocated
+ bool createVirtualQueue(
+ uint deviceQueueSize //!< Device queue size
+ );
+
EngineType engineID_; //!< Engine ID for this VirtualGPU
ResourceSlots slots_; //!< Resource slots for kernel arguments
State state_; //!< virtual GPU current state
@@ -488,11 +493,6 @@ private:
const amd::BufferRect& dstRect //!< region of destination for copy
);
- //! Returns TRUE if virtual queue was successfully allocatted
- bool createVirtualQueue(
- uint deviceQueueSize //!< Device queue size
- );
-
GslKernels gslKernels_; //!< GSL kernel descriptors
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
GpuEvents gpuEvents_; //!< GPU events
@@ -529,6 +529,7 @@ private:
Memory* virtualQueue_; //!< Virtual device queue
Memory* schedParams_; //!< The scheduler parameters
uint schedParamIdx_; //!< Index in the scheduler parameters buffer
+ uint deviceQueueSize_; //!< Device queue size
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
};
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index 46c078df59..303ff0c120 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -1290,9 +1290,9 @@ CALGSLContext::virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* me
void
CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
- uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal)
+ uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue)
{
eventBegin(MainEngine);
- m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal);
+ m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
eventEnd(MainEngine, event);
}
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index 849101423e..2f7cc62cea 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -47,7 +47,8 @@ public:
uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA);
mcaddr virtualQueueDispatcherStart();
void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart);
- void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal);
+ void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
+ uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue);
bool isDone(GpuEvent* event);
void waitForEvent(GpuEvent* event);
void flushIOCaches() const;
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index a36f7ea156..862c7d199b 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -150,6 +150,8 @@ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
"Prints the specified number of the child kernels") \
release(bool, GPU_DIRECT_SRD, true, \
"Use indirect SRD access in HSAIL") \
+release(bool, GPU_USE_DEVICE_QUEUE, false, \
+ "Use a dedicated device queue for the actual submissions") \
release(bool, AMD_DEPTH_MSAA_INTEROP, false, \
"Enable depth stencil and MSAA buffer interop") \
release(bool, AMD_THREAD_TRACE_ENABLE, false, \