From 5994573b3fde49b7d64bb00e323a92104f5769f5 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 19 Dec 2017 13:29:53 -0500
Subject: [PATCH] P4 to Git Change 1496286 by gandryey@gera-w8 on 2017/12/19 13:24:13

SWDEV-140401 - [CQE OCL][PAL][Vega10] Observed ~ 80 % performance drop while running GE_HealthCare - PenalityTest
- Port the logic to consider cache line size for the local workgroup detection.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#41 edit
---
 rocclr/runtime/device/pal/palkernel.cpp | 37 +++++++++++++++++++------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 3fbb6c0b9b..cc081d7db2 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -813,10 +813,15 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
           tmp /= div;
         }
 
+        // Assuming DWORD access
+        const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
+
         // Check if partial dispatch is enabled and
         if (dev().settings().partialDispatch_ &&
-            // we couldn't find optimal workload
-            (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
+            // we couldn't find optimal workload
+            (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
+             // or size is too small for the cache line
+             (lclWorkSize[0] < cacheLineMatch))) {
           size_t maxSize = 0;
           size_t maxDim = 0;
           for (uint d = 0; d < workDim; ++d) {
@@ -825,14 +830,28 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
               maxDim = d;
             }
           }
-          // Check if a local workgroup has the most optimal size
-          if (thrPerGrp > maxSize) {
-            thrPerGrp = maxSize;
+          // Use X dimension as high priority. Runtime will assume that
+          // X dimension is more important for the address calculation
+          if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
+            lclWorkSize[0] = cacheLineMatch;
+            thrPerGrp /= cacheLineMatch;
+            lclWorkSize[maxDim] = thrPerGrp;
+            for (uint d = 1; d < workDim; ++d) {
+              if (d != maxDim) {
+                lclWorkSize[d] = 1;
+              }
+            }
           }
-          lclWorkSize[maxDim] = thrPerGrp;
-          for (uint d = 0; d < workDim; ++d) {
-            if (d != maxDim) {
-              lclWorkSize[d] = 1;
+          else {
+            // Check if a local workgroup has the most optimal size
+            if (thrPerGrp > maxSize) {
+              thrPerGrp = maxSize;
+            }
+            lclWorkSize[maxDim] = thrPerGrp;
+            for (uint d = 0; d < workDim; ++d) {
+              if (d != maxDim) {
+                lclWorkSize[d] = 1;
+              }
             }
           }
         }