From 1dcac07a7eda127a3a24ef516370d91abd38f9fa Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 16 Aug 2018 17:49:03 -0400
Subject: [PATCH] P4 to Git Change 1594730 by asalmanp@asalmanp-ocl-stg on
 2018/08/16 17:32:14

	SWDEV-160930 - SPECworkstation 3 benchmark GPU Compute tests fail
	Root cause: The Caffe compute benchmark fails within the SPECworkstation app because one of Caffe's OCL kernels is launched with a local_work_size of 1024, causing the clEnqueueNDRangeKernel API to return CL_INVALID_WORK_GROUP_SIZE (the maximum allowed work-group size is 256).
	Proposed workaround: To allow a kernel to run with a local_work_size of 1024, we check the number of VGPRs used by the kernel; if the kernel is not using all the available VGPRs, we let it use a local_work_size of up to 1024.

	ReviewURLBoard = http://ocltc.amd.com/reviews/r/15638/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#58 edit
---
 rocclr/runtime/device/pal/palkernel.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index f35ac18b7a..20c06046a8 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -716,7 +716,10 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
     workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
         workGroupInfo_.compileSize_[2];
   } else {
-    workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
+      size_t nItems = (workGroupInfo_.availableVGPRs_ / workGroupInfo_.usedVGPRs_) *
+        dev().hwInfo()->simdPerCU_ * workGroupInfo_.wavefrontSize_;
+      workGroupInfo_.size_ = nItems > dev().info().preferredWorkGroupSize_ ?
+        std::min(size_t(1024) , nItems) : dev().info().preferredWorkGroupSize_;
   }
 
   // Pull out printf metadata from the ELF
@@ -1435,7 +1438,12 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
   // Copy wavefront size
   workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
 
-  workGroupInfo_.size_ = kernelMD->mCodeProps.mMaxFlatWorkGroupSize;
+
+  size_t nItems = (workGroupInfo_.availableVGPRs_ / workGroupInfo_.usedVGPRs_) *
+    dev().hwInfo()->simdPerCU_ * workGroupInfo_.wavefrontSize_;
+  workGroupInfo_.size_ = nItems > kernelMD->mCodeProps.mMaxFlatWorkGroupSize ?
+    std::min(size_t(1024), nItems) : kernelMD->mCodeProps.mMaxFlatWorkGroupSize;
+
   if (workGroupInfo_.size_ == 0) {
     return false;
   }