diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index 3fbb6c0b9b..cc081d7db2 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -813,10 +813,15 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS tmp /= div; } + // Assuming DWORD access + const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2; + // Check if partial dispatch is enabled and if (dev().settings().partialDispatch_ && - // we couldn't find optimal workload - (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) { + // we couldn't find optimal workload + (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch))) { size_t maxSize = 0; size_t maxDim = 0; for (uint d = 0; d < workDim; ++d) { @@ -825,14 +830,28 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS maxDim = d; } } - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; + // Use X dimension as high priority. Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } } } }