From cd3fefb00db00b000228d1ae66460e27e4550436 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Mon, 7 Jul 2014 18:58:52 -0400
Subject: [PATCH] P4 to Git Change 1052832 by gandryey@gera-dev-w7 on
 2014/07/07 18:44:29

	ECR #304775 - Device enqueuing
	- Update the scheduler to handle event mask

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#18 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#320 edit
---
 rocclr/runtime/device/gpu/gpuschedcl.cpp | 52 +++++++++++++++++++-----
 rocclr/runtime/device/gpu/gpuvirtual.cpp | 11 +++--
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 50e0d61db2..7503532e01 100644
--- a/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -164,7 +164,8 @@ const uint StallExecution = 0x00000000; // 0x01000000
 const uint WavefrontSize = 64;
 const uint MaxWaveSize = 0x400;
 
-void dispatch(
+static inline void 
+dispatch(
     volatile __global HwDispatch*   dispatch,
     __global HsaAqlDispatchPacket*  aqlPkt,
     uint                            scratchSize,
@@ -272,20 +273,21 @@ void dispatch(
     dispatch->startExe = ResumeExecution;
 }
 
-bool
-checkWaitEvents(__global AmdEvent* events, uint numEvents)
+static inline bool  // true only if every event in the wait list reports CL_COMPLETE
+checkWaitEvents(__global AmdEvent** events, uint numEvents)  // events: numEvents event pointers
 {
     for (uint i = 0; i < numEvents; ++i) {
-        if (atomic_and(&events[i].state, 0xffffffff) != CL_COMPLETE) {
+        if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) {  // AND with all ones leaves state unchanged
             return false;
         }
     }
     return true;
 }
 
+
 // release slot in a bitmask controlled resource i is the slot number
 static inline void
-release_slot(__global uint * restrict mask, uint i)
+release_slot(__global uint* restrict mask, uint i)
 {
     /* uint b = ~(1UL << (i & 0x1f)); */
     uint b = ~amd_bfm(1U, i);
@@ -301,6 +303,26 @@ release_slot(__global uint * restrict mask, uint i)
     }
 }
 
+static inline void  // atomically decrement ev->counter; free ev's slot in emask once it reaches zero
+releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb)
+{
+    uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U,
+        memory_order_acq_rel, memory_scope_device);  // c holds the counter value from before the decrement
+    if (c == 1U) {  // this call dropped the counter from 1 to 0
+        uint i = ev - eb;  // element index of ev counted from eb; assumes ev lies in the array at eb - confirm
+        release_slot(emask, i);
+    }
+}
+
+static inline void  // calls releaseEvent() once for each of the numEvents pointers in events
+releaseWaitEvents(__global AmdEvent** events, uint numEvents,
+    __global uint* emask, __global AmdEvent* eb)  // emask and eb are forwarded unchanged to releaseEvent()
+{
+    for (uint i = 0; i < numEvents; ++i) {
+        releaseEvent(events[i], emask, eb);
+    }
+}
+
 static inline uint
 min_command(uint slot_num, __global AmdAqlWrap* wraps)
 {
@@ -370,7 +392,7 @@ scheduler(
                         if (disp->wait_num != 0) {
                             // Check if the wait list is COMPLETE
                             launch = checkWaitEvents(
-                                (__global AmdEvent*)(disp->wait_list), disp->wait_num);
+                                (__global AmdEvent**)(disp->wait_list), disp->wait_num);
                         }
                         else {
                             launch = 1;
@@ -381,12 +403,15 @@ scheduler(
                             memory_order_acq_rel, memory_order_acq_rel, memory_scope_device)) {
                             if (event != 0) {
                                 event->timer[PROFILING_COMMAND_START] =
-                                    (__hsail_get_clock() * 1000) / (ulong)param->eng_clk;
+                                    (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
                             }
                             // Launch child kernel ....
                             dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves,
                                 param->scratch, param->hsa_queue);
                             disp->state = AQL_WRAP_BUSY;
+                            releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+                                disp->wait_num, (__global uint*)queue->event_slot_mask,
+                                (__global AmdEvent*)queue->event_slots);
                             break;
                         }
                     }
@@ -402,8 +427,11 @@ scheduler(
                     else {
                         // Check if the wait list is COMPLETE
                         if (checkWaitEvents(
-                            (__global AmdEvent*)(disp->wait_list), disp->wait_num)) {
+                            (__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
                             complete = true;
+                            releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+                                disp->wait_num, (__global uint*)queue->event_slot_mask,
+                                (__global AmdEvent*)queue->event_slots);
                         }
                     }
                     if (complete) {
@@ -414,6 +442,8 @@ scheduler(
                         event->state = CL_COMPLETE;
                         disp->state = AQL_WRAP_FREE;
                         release_slot(amask, idx);
+                        releaseEvent(event, (__global uint*)queue->event_slot_mask,
+                            (__global AmdEvent*)queue->event_slots);
                     }
                 }
                 else if (slotState == AQL_WRAP_DONE) {
@@ -424,12 +454,14 @@ scheduler(
                             event->state = CL_COMPLETE;
                             event->timer[PROFILING_COMMAND_END] =
                             event->timer[PROFILING_COMMAND_COMPLETE] =
-                                (__hsail_get_clock() * 1000) / (ulong)param->eng_clk;
+                                (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
                         }
                         else {
                             event->timer[PROFILING_COMMAND_END] = 
-                                (__hsail_get_clock() * 1000) / (ulong)param->eng_clk;
+                                (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
                         }
+                        releaseEvent(event, (__global uint *)queue->event_slot_mask,
+                            (__global AmdEvent *)queue->event_slots);
                     }
                     // The current dispatch doesn't have any outstanding children
                     if (disp->child_counter == 0) {
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index cba81a9e4c..0e6b55cd65 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -284,11 +284,11 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 
     uint    eventMaskOffs = allocSize;
     // Add mask array for events
-    allocSize += amd::alignUp(dev().settings().numDeviceEvents_, 32) / 32;
+    allocSize += amd::alignUp(dev().settings().numDeviceEvents_, 32) / 8;
 
     uint    slotMaskOffs = allocSize;
     // Add mask array for AmdAqlWrap slots
-    allocSize += amd::alignUp(numSlots, 32) / 32;
+    allocSize += amd::alignUp(numSlots, 32) / 8;
 
     virtualQueue_ = new Memory(dev(), allocSize);
     Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ?
@@ -1680,6 +1680,10 @@ VirtualGPU::submitKernelInternalHSA(
             gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
         }
         vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
+        if (gpuDefQueue->hwRing() == hwRing()) {
+            LogError("Can't submit the child kernels to the same HW ring as the host queue!");
+            return false;
+        }
 
         // Add memory handles before the actual dispatch
         memList.push_back(gpuDefQueue->virtualQueue_);
@@ -1811,7 +1815,8 @@ VirtualGPU::submitKernelInternalHSA(
         SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
             (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
         param->signal = 1;
-        param->eng_clk = dev().info().maxClockFrequency_;
+        // Store (1000 * 1024) / clock so the scheduler can multiply and shift right by 10 instead of doing a 64-bit divide
+        param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
         param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
         param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
         param->launch = 0;