From 977f314094e90f7889ff4831b747c7b503ddd9e7 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Tue, 8 Jul 2014 15:30:04 -0400
Subject: [PATCH] P4 to Git Change 1053268 by gandryey@gera-dev-w7 on
 2014/07/08 15:08:51

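	ECR #304775 - Device enqueuing
	- Print the state and counter of every wait event when dumping
	  child kernel wraps
	- Always call checkWaitEvents() in the scheduler (the wait_num != 0
	  special case is removed) and skip to the next iteration when there
	  is nothing to launch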

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#321 edit
---
 rocclr/runtime/device/gpu/gpuschedcl.cpp | 15 ++++++---------
 rocclr/runtime/device/gpu/gpuvirtual.cpp | 14 +++++++++++++-
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 7503532e01..0708bf975e 100644
--- a/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -388,15 +388,12 @@ scheduler(
                             continue;
                         }
 
-                        // Check if the command has any the wait events
-                        if (disp->wait_num != 0) {
-                            // Check if the wait list is COMPLETE
-                            launch = checkWaitEvents(
-                                (__global AmdEvent**)(disp->wait_list), disp->wait_num);
-                        }
-                        else {
-                            launch = 1;
-                        }
+                        // Check if the wait list is COMPLETE
+                        launch = checkWaitEvents(
+                            (__global AmdEvent**)(disp->wait_list), disp->wait_num);
+
+                        if (launch == 0) continue;
+
                         uint tmp = 0;
                         if (atomic_compare_exchange_strong_explicit(
                             (__global atomic_uint*)&param->launch, &tmp, launch,
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 0e6b55cd65..9a4497be2c 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1734,6 +1734,7 @@ VirtualGPU::submitKernelInternalHSA(
             uint p = 0;
             for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
                 if (wraps[i].state != 0) {
+                    uint j;
                     if (p == GPU_PRINT_CHILD_KERNEL) {
                         break;
                     }
@@ -1748,6 +1749,18 @@ VirtualGPU::submitKernelInternalHSA(
                     print << "\tparent_wrap: "    << wraps[i].parent_wrap     << "\n";
                     print << "\twait_list: "      << wraps[i].wait_list       << "\n";
                     print << "\twait_num: "       << wraps[i].wait_num        << "\n";
+                    uint offsEvents = wraps[i].wait_list -
+                        gpuDefQueue->virtualQueue_->vmAddress();
+                    size_t* events = reinterpret_cast<size_t*>(
+                        gpuDefQueue->virtualQueue_->data() + offsEvents);
+                    for (j = 0; j < wraps[i].wait_num; ++j) {
+                        uint offs = static_cast<uint64_t>(events[j]) -
+                            gpuDefQueue->virtualQueue_->vmAddress();
+                        AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
+                        print << "Wait Event#: " << j << "\n";
+                        print << "\tState: " << eventD->state <<
+                                 "; Counter: " << eventD->counter << "\n";
+                    }
                     print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size[0] << ", ";
                     print << wraps[i].aql.workgroup_size[1] << ", ";
                     print << wraps[i].aql.workgroup_size[2] << "]\n";
@@ -1757,7 +1770,6 @@ VirtualGPU::submitKernelInternalHSA(
 
                     uint64_t* kernels = (uint64_t*)(
                         const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
-                    uint j;
                     for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
                         if (kernels[j] == wraps[i].aql.kernel_object_address) {
                             break;