From 4e5c499ace5829b9fa5c8a52be400a8bfb1ed881 Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 19 Aug 2015 12:37:48 -0400 Subject: [PATCH] P4 to Git Change 1182228 by gandryey@gera-dev-w7 on 2015/08/19 12:22:39 EPR #419072 - [OpenCL2.0] Enable 16MB large on device queues - Add mask_groups argument into the library for the empty slot spreading Affected files ... ... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/devenq.h#11 edit ... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/enqueue.cl#10 edit ... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/eprep.cl#6 edit ... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/events.cl#4 edit ... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/schedule.cl#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#520 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#373 edit --- rocclr/runtime/device/gpu/gpudevice.cpp | 2 +- rocclr/runtime/device/gpu/gpusched.hpp | 5 ++--- rocclr/runtime/device/gpu/gpuvirtual.cpp | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index 463d32be16..b48d497bc0 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -489,7 +489,7 @@ void NullDevice::fillDeviceInfo( info_.queueOnDeviceProperties_ = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; info_.queueOnDevicePreferredSize_ = 256 * Ki; - info_.queueOnDeviceMaxSize_ = 12 * Mi; + info_.queueOnDeviceMaxSize_ = 16 * Mi; info_.maxOnDeviceQueues_ = 1; info_.maxOnDeviceEvents_ = settings().numDeviceEvents_; info_.globalVariablePreferredTotalSize_ = static_cast<size_t>(info_.globalMemSize_); diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp index 
8776957726..3d5bace02b 100644 --- a/rocclr/runtime/device/gpu/gpusched.hpp +++ b/rocclr/runtime/device/gpu/gpusched.hpp @@ -28,7 +28,7 @@ struct AmdVQueueHeader { uint32_t command_counter; //!< [LRW] The global counter for the submitted commands into the queue uint32_t wait_size; //!< [LRO] The wait list size (in clk_event_t) uint32_t arg_size; //!< [LRO] The size of argument buffer (in bytes) - uint32_t reserved0; //!< For the future usage + uint32_t mask_groups; //!< Processed mask groups by one thread uint64_t kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) uint32_t reserved[2]; //!< For the future usage }; @@ -70,8 +70,7 @@ struct SchedulerParam { uint64_t parentAQL; //!< Host parent AmdAqlWrap packet uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue uint32_t scratchOffset; //!< Scratch buffer offset - uint32_t mask_groups; //!< Processed mask groups by one thread - uint32_t reserved; //!< Reserved + uint32_t reserved[2]; //!< Reserved }; } // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index 74dee7aa67..19a591e4e8 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -358,6 +358,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) header->aql_slot_mask = vaBase + slotMaskOffs; header->wait_size = dev().settings().numWaitEvents_; header->arg_size = dev().info().maxParameterSize_ + 64; + header->mask_groups = maskGroups_; vqHeader_ = new AmdVQueueHeader; if (NULL == vqHeader_) { return false; @@ -1938,7 +1939,6 @@ VirtualGPU::submitKernelInternalHSA( param->parentAQL = vmParentWrap; param->dedicatedQueue = dev().settings().useDeviceQueue_; param->useATC = dev().settings().svmFineGrainSystem_; - param->mask_groups = maskGroups_; // Fill the scratch buffer information if (hsaKernel.prog().maxScratchRegs() > 0) {