From f998e7094be56ac7a127f168368c024ad1c19092 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 22 Jul 2014 17:30:56 -0400
Subject: [PATCH] P4 to Git Change 1057998 by gandryey@gera-dev-w7 on
2014/07/22 17:15:58
ECR #304775 - Device enqueuing
- Use atomic fetch for enqueue flags
- Switch to a multithreaded scheduler
- Add a workaround for Linux host_multi_queue failures. Linux has only 2 HW queues, but the test allocates multiple host queues, so the same HW ring can end up being used by more than one queue
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#106 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#449 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#325 edit
[ROCm/clr commit: d2b905f18eeace32bce0d1ffdc22ca5109b580c9]
---
.../clr/rocclr/runtime/device/gpu/gpublit.cpp | 2 +-
.../rocclr/runtime/device/gpu/gpudevice.cpp | 1 +
.../rocclr/runtime/device/gpu/gpudevice.hpp | 3 +-
.../rocclr/runtime/device/gpu/gpuschedcl.cpp | 223 +++++++++---------
.../rocclr/runtime/device/gpu/gpuvirtual.cpp | 22 +-
5 files changed, 136 insertions(+), 115 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
index cd5fd38ca7..34dfc5ded9 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
@@ -2732,7 +2732,7 @@ KernelBlitManager::runScheduler(
size_t dim = 1;
size_t globalWorkOffset[1] = { 0 };
- size_t globalWorkSize[1] = { 1 };
+ size_t globalWorkSize[1] = { numSlots / 32 };
size_t localWorkSize[1] = { 1 };
// Program kernels arguments
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
index 62404e1e9c..35590b04e2 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -359,6 +359,7 @@ Device::Device()
: NullDevice()
, CALGSLDevice()
, numOfVgpus_(0)
+ , numDeviceQueues_(0)
, context_(NULL)
, heap_(NULL)
, dummyPage_(NULL)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
index c276f4d739..2a6cb24aaf 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -467,7 +467,8 @@ public:
//! Returns the number of virtual GPUs allocated on this device
uint numOfVgpus() const { return numOfVgpus_; }
- uint numOfVgpus_; //!< The number of virtual GPUs (lock protected)
+ uint numOfVgpus_; //!< The number of virtual GPUs (lock protected)
+ uint numDeviceQueues_; //!< Number of device queues
typedef std::vector VirtualGPUs;
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 8b2811ab57..6390025655 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -69,7 +69,7 @@ typedef struct _AmdAqlWrap {
ulong completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
ulong parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
ulong wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
- uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
+ uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
uint reserved[5]; //!< For the future usage
HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
} AmdAqlWrap;
@@ -163,7 +163,7 @@ const uint StallExecution = 0x00000000; // 0x01000000
const uint WavefrontSize = 64;
const uint MaxWaveSize = 0x400;
-static inline void
+static inline void
dispatch(
volatile __global HwDispatch* dispatch,
__global HsaAqlDispatchPacket* aqlPkt,
@@ -251,7 +251,7 @@ dispatch(
usrRegCnt += (flags & 0x8) ? 2 : 0;
dispatch->argsLo = (uint)aqlPkt->kernel_arg_address;
dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32);
-
+
// flatScratchEna = (flags & 0x20);
if (flags & 0x20) {
dispatch->copyData = Pm4CopyReg;
@@ -363,128 +363,129 @@ scheduler(
uint loop;
do {
- for (uint m = 0; m < (queue->aql_slot_num >> 5); ++m) {
- uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[m]),
- memory_order_acquire, memory_scope_device);
+ uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
+ memory_order_acquire, memory_scope_device);
- int baseIdx = m * 32;
- while (mask != 0) {
- uint sIdx = ctz(mask);
- uint idx = baseIdx + sIdx;
- mask &= ~(1 << sIdx);
- __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
- uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
- memory_order_acquire, memory_scope_device);
- __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap);
- __global AmdEvent* event = (__global AmdEvent*)(disp->completion);
+ int baseIdx = get_group_id(0) * 32;
+ while (mask != 0) {
+ uint sIdx = ctz(mask);
+ uint idx = baseIdx + sIdx;
+ mask &= ~(1 << sIdx);
+ __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
+ uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
+ memory_order_acquire, memory_scope_device);
+ __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap);
+ __global AmdEvent* event = (__global AmdEvent*)(disp->completion);
- // Check if the current slot is ready for processing
- if (slotState == AQL_WRAP_READY) {
- if (launch == 0) {
- launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
- memory_order_acquire, memory_scope_device);
+ // Check if the current slot is ready for processing
+ if (slotState == AQL_WRAP_READY) {
+ if (launch == 0) {
+ launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
+ memory_order_acquire, memory_scope_device);
+ }
+ if (launch == 0) {
+ // Attempt to find a new disaptch if nothing was launched yet
+ uint parentState = atomic_load_explicit(
+ (__global atomic_uint*)(&parent->state),
+ memory_order_acquire, memory_scope_device);
+ uint enqueueFlags = atomic_load_explicit(
+ (__global atomic_uint*)(&disp->enqueue_flags),
+ memory_order_acquire, memory_scope_device);
+
+ // Check the launch flags
+ if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) ||
+ (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) &&
+ (parentState != AQL_WRAP_DONE)) {
+ continue;
}
- if (launch == 0) {
- // Attempt to find a new disaptch if nothing was launched yet
- uint parentState = atomic_load_explicit(
- (__global atomic_uint*)(&parent->state),
- memory_order_acquire, memory_scope_device);
- // Check the launch flags
- if (((disp->enqueue_flags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) ||
- (disp->enqueue_flags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) &&
- (parentState != AQL_WRAP_DONE)) {
- continue;
- }
-
- // Check if the wait list is COMPLETE
- launch = checkWaitEvents(
- (__global AmdEvent**)(disp->wait_list), disp->wait_num);
-
- if (launch == 0) continue;
-
- uint tmp = 0;
- if (atomic_compare_exchange_strong_explicit(
- (__global atomic_uint*)&param->launch, &tmp, launch,
- memory_order_acq_rel, memory_order_acq_rel, memory_scope_device)) {
- if (event != 0) {
- event->timer[PROFILING_COMMAND_START] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- }
- // Launch child kernel ....
- dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves,
- param->scratch, param->hsa_queue);
- disp->state = AQL_WRAP_BUSY;
- releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
- disp->wait_num, (__global uint*)queue->event_slot_mask,
- (__global AmdEvent*)queue->event_slots);
- break;
+ // Check if the wait list is COMPLETE
+ launch = checkWaitEvents(
+ (__global AmdEvent**)(disp->wait_list), disp->wait_num);
+
+ if (launch == 0) continue;
+
+ uint tmp = 0;
+ if (atomic_compare_exchange_strong_explicit(
+ (__global atomic_uint*)&param->launch, &tmp, launch,
+ memory_order_acq_rel, memory_order_acq_rel, memory_scope_device)) {
+ if (event != 0) {
+ event->timer[PROFILING_COMMAND_START] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
}
+ // Launch child kernel ....
+ dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves,
+ param->scratch, param->hsa_queue);
+ disp->state = AQL_WRAP_BUSY;
+ releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+ disp->wait_num, (__global uint*)queue->event_slot_mask,
+ (__global AmdEvent*)queue->event_slots);
+ break;
}
}
- else if (slotState == AQL_WRAP_MARKER) {
- bool complete = false;
- if (disp->wait_num == 0) {
- uint minCommand = min_command(queue->aql_slot_num, wraps);
- if (disp->command_id == minCommand) {
- complete = true;
- }
+ }
+ else if (slotState == AQL_WRAP_MARKER) {
+ bool complete = false;
+ if (disp->wait_num == 0) {
+ uint minCommand = min_command(queue->aql_slot_num, wraps);
+ if (disp->command_id == minCommand) {
+ complete = true;
}
- else {
- // Check if the wait list is COMPLETE
- if (checkWaitEvents(
- (__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
- complete = true;
- releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
- disp->wait_num, (__global uint*)queue->event_slot_mask,
- (__global AmdEvent*)queue->event_slots);
- }
- }
- if (complete) {
- // Decrement the child execution counter on the parent
- atomic_fetch_sub_explicit(
- (__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
- event->state = CL_COMPLETE;
- disp->state = AQL_WRAP_FREE;
- release_slot(amask, idx);
- releaseEvent(event, (__global uint*)queue->event_slot_mask,
+ }
+ else {
+ // Check if the wait list is COMPLETE
+ if (checkWaitEvents(
+ (__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
+ complete = true;
+ releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+ disp->wait_num, (__global uint*)queue->event_slot_mask,
(__global AmdEvent*)queue->event_slots);
}
}
- else if (slotState == AQL_WRAP_DONE) {
- // Was CL_EVENT requested?
- if (event != 0) {
- // The current dispatch doesn't have any outstanding children
- if (disp->child_counter == 0) {
- event->state = CL_COMPLETE;
- event->timer[PROFILING_COMMAND_END] =
- event->timer[PROFILING_COMMAND_COMPLETE] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- }
- else {
- event->timer[PROFILING_COMMAND_END] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- }
- releaseEvent(event, (__global uint *)queue->event_slot_mask,
- (__global AmdEvent *)queue->event_slots);
- }
- // The current dispatch doesn't have any outstanding children
- if (disp->child_counter == 0) {
- // Decrement the child execution counter on the parent
- atomic_fetch_sub_explicit(
- (__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
- disp->state = AQL_WRAP_FREE;
- release_slot(amask, idx);
- }
- }
- else if (slotState == AQL_WRAP_BUSY) {
- disp->state = AQL_WRAP_DONE;
+ if (complete) {
+ // Decrement the child execution counter on the parent
+ atomic_fetch_sub_explicit(
+ (__global atomic_uint*)&parent->child_counter,
+ 1, memory_order_acq_rel, memory_scope_device);
+ event->state = CL_COMPLETE;
+ disp->state = AQL_WRAP_FREE;
+ release_slot(amask, idx);
+ releaseEvent(event, (__global uint*)queue->event_slot_mask,
+ (__global AmdEvent*)queue->event_slots);
}
}
- if (launch == 1) break;
+ else if (slotState == AQL_WRAP_DONE) {
+ // Was CL_EVENT requested?
+ if (event != 0) {
+ // The current dispatch doesn't have any outstanding children
+ if (disp->child_counter == 0) {
+ event->state = CL_COMPLETE;
+ event->timer[PROFILING_COMMAND_END] =
+ event->timer[PROFILING_COMMAND_COMPLETE] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
+ }
+ else {
+ event->timer[PROFILING_COMMAND_END] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
+ }
+ releaseEvent(event, (__global uint *)queue->event_slot_mask,
+ (__global AmdEvent *)queue->event_slots);
+ }
+ // The current dispatch doesn't have any outstanding children
+ if (disp->child_counter == 0) {
+ // Decrement the child execution counter on the parent
+ atomic_fetch_sub_explicit(
+ (__global atomic_uint*)&parent->child_counter,
+ 1, memory_order_acq_rel, memory_scope_device);
+ disp->state = AQL_WRAP_FREE;
+ release_slot(amask, idx);
+ }
+ }
+ else if (slotState == AQL_WRAP_BUSY) {
+ disp->state = AQL_WRAP_DONE;
+ }
}
+
barrier(CLK_GLOBAL_MEM_FENCE);
 launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 363a96c6e6..ff00d8bed5 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -453,8 +453,22 @@ VirtualGPU::create(
#endif // !cl_amd_open_video
{
if (dev().engines().numComputeRings()) {
- //!@note: Add 1 to account the device queue for transfers
- uint idx = (index() + 1) % dev().engines().numComputeRings();
+ uint idx;
+
+ //! @todo Temporary workaround for Linux, because 2 HW queues only
+ //! Fixes conformance failures with multi queues
+ if ((0 == deviceQueueSize) || IS_WINDOWS) {
+ idx = index() % (dev().engines().numComputeRings() -
+ gpuDevice_.numDeviceQueues_);
+ }
+ else {
+ gpuDevice_.numDeviceQueues_++;
+ if (gpuDevice_.numDeviceQueues_ >= dev().engines().numComputeRings()) {
+ return false;
+ }
+ idx = (dev().engines().numComputeRings() - gpuDevice_.numDeviceQueues_)
+ % dev().engines().numComputeRings();
+ }
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
@@ -583,6 +597,10 @@ VirtualGPU::~VirtualGPU()
amd::ScopedLock k(dev().lockAsyncOps());
amd::ScopedLock lock(dev().vgpusAccess());
+ if ((NULL != virtualQueue_) && IS_LINUX) {
+ gpuDevice_.numDeviceQueues_--;
+ }
+
uint i;
// Destroy all kernels
for (GslKernels::const_iterator it = gslKernels_.begin();