From 4e5c499ace5829b9fa5c8a52be400a8bfb1ed881 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Wed, 19 Aug 2015 12:37:48 -0400
Subject: [PATCH] P4 to Git Change 1182228 by gandryey@gera-dev-w7 on
 2015/08/19 12:22:39

	EPR #419072 - [OpenCL2.0] Enable large (16MB) on-device queues
	- Add a mask_groups argument to the library for empty-slot spreading

Affected files ...

... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/devenq.h#11 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/enqueue.cl#10 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/eprep.cl#6 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/events.cl#4 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/schedule.cl#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#520 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#18 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#373 edit
---
 rocclr/runtime/device/gpu/gpudevice.cpp  | 2 +-
 rocclr/runtime/device/gpu/gpusched.hpp   | 5 ++---
 rocclr/runtime/device/gpu/gpuvirtual.cpp | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 463d32be16..b48d497bc0 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -489,7 +489,7 @@ void NullDevice::fillDeviceInfo(
         info_.queueOnDeviceProperties_ =
             CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
         info_.queueOnDevicePreferredSize_ = 256 * Ki;
-        info_.queueOnDeviceMaxSize_ = 12 * Mi;
+        info_.queueOnDeviceMaxSize_ = 16 * Mi;
         info_.maxOnDeviceQueues_ = 1;
         info_.maxOnDeviceEvents_ = settings().numDeviceEvents_;
         info_.globalVariablePreferredTotalSize_ = static_cast<size_t>(info_.globalMemSize_);
diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp
index 8776957726..3d5bace02b 100644
--- a/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/rocclr/runtime/device/gpu/gpusched.hpp
@@ -28,7 +28,7 @@ struct AmdVQueueHeader {
     uint32_t command_counter;   //!< [LRW] The global counter for the submitted commands into the queue
     uint32_t wait_size;         //!< [LRO] The wait list size (in clk_event_t)
     uint32_t arg_size;          //!< [LRO] The size of argument buffer (in bytes)
-    uint32_t reserved0;         //!< For the future usage
+    uint32_t mask_groups;       //!< Processed mask groups by one thread
     uint64_t kernel_table;      //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
     uint32_t reserved[2];       //!< For the future usage
 };
@@ -70,8 +70,7 @@ struct SchedulerParam {
     uint64_t    parentAQL;      //!< Host parent AmdAqlWrap packet
     uint32_t    dedicatedQueue; //!< Scheduler uses a dedicated queue
     uint32_t    scratchOffset;  //!< Scratch buffer offset
-    uint32_t    mask_groups;    //!< Processed mask groups by one thread
-    uint32_t    reserved;       //!< Reserved
+    uint32_t    reserved[2];    //!< Reserved
 };
 
 } // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 74dee7aa67..19a591e4e8 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -358,6 +358,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
     header->aql_slot_mask   = vaBase + slotMaskOffs;
     header->wait_size       = dev().settings().numWaitEvents_;
     header->arg_size        = dev().info().maxParameterSize_ + 64;
+    header->mask_groups     = maskGroups_;
     vqHeader_ = new AmdVQueueHeader;
     if (NULL == vqHeader_) {
         return false;
@@ -1938,7 +1939,6 @@ VirtualGPU::submitKernelInternalHSA(
             param->parentAQL = vmParentWrap;
             param->dedicatedQueue = dev().settings().useDeviceQueue_;
             param->useATC = dev().settings().svmFineGrainSystem_;
-            param->mask_groups = maskGroups_;
 
             // Fill the scratch buffer information
             if (hsaKernel.prog().maxScratchRegs() > 0) {