From 5994573b3fde49b7d64bb00e323a92104f5769f5 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 19 Dec 2017 13:29:53 -0500
Subject: [PATCH] P4 to Git Change 1496286 by gandryey@gera-w8 on 2017/12/19
13:24:13
SWDEV-140401 - [CQE OCL][PAL][Vega10] Observed ~ 80 % performance drop while running GE_HealthCare - PenalityTest
- Port the logic that takes the cache line size into account when choosing the local workgroup size.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#41 edit
---
rocclr/runtime/device/pal/palkernel.cpp | 37 +++++++++++++++++++------
1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 3fbb6c0b9b..cc081d7db2 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -813,10 +813,15 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
tmp /= div;
}
+ // Assuming DWORD access
+ const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
+
// Check if partial dispatch is enabled and
if (dev().settings().partialDispatch_ &&
- // we couldn't find optimal workload
- (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
+ // we couldn't find optimal workload
+ (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
+ // or size is too small for the cache line
+ (lclWorkSize[0] < cacheLineMatch))) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
@@ -825,14 +830,28 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
maxDim = d;
}
}
- // Check if a local workgroup has the most optimal size
- if (thrPerGrp > maxSize) {
- thrPerGrp = maxSize;
+ // Use X dimension as high priority. Runtime will assume that
+ // X dimension is more important for the address calculation
+ if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
+ lclWorkSize[0] = cacheLineMatch;
+ thrPerGrp /= cacheLineMatch;
+ lclWorkSize[maxDim] = thrPerGrp;
+ for (uint d = 1; d < workDim; ++d) {
+ if (d != maxDim) {
+ lclWorkSize[d] = 1;
+ }
+ }
}
- lclWorkSize[maxDim] = thrPerGrp;
- for (uint d = 0; d < workDim; ++d) {
- if (d != maxDim) {
- lclWorkSize[d] = 1;
+ else {
+ // Check if a local workgroup has the most optimal size
+ if (thrPerGrp > maxSize) {
+ thrPerGrp = maxSize;
+ }
+ lclWorkSize[maxDim] = thrPerGrp;
+ for (uint d = 0; d < workDim; ++d) {
+ if (d != maxDim) {
+ lclWorkSize[d] = 1;
+ }
}
}
}