diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp index e598dca74f..14b65047f7 100644 --- a/projects/clr/rocclr/device/devkernel.cpp +++ b/projects/clr/rocclr/device/devkernel.cpp @@ -711,42 +711,44 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, tmp /= div; } - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + if (!workGroupInfo()->uniformWorkGroupSize_) { + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Use X dimension as high priority. Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; } } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; + // Use X dimension as high priority. Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } } } }