SWDEV-407533 - [ABI Break]Purge unused env vars
Change-Id: I627950e8ebb6299affc602754a20d442dbe42b14
[ROCm/clr commit: 077311153a]
This commit is contained in:
committed by
German Andryeyev
orang tua
98b3f79d45
melakukan
3f4bbcfdba
@@ -684,102 +684,82 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
|
||||
if (workGroupInfo()->compileSize_[0] == 0) {
|
||||
// Find the default local workgroup size, if it wasn't specified
|
||||
if (lclWorkSize[0] == 0) {
|
||||
if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
|
||||
// Find threads per group
|
||||
size_t thrPerGrp = workGroupInfo()->size_;
|
||||
// Find threads per group
|
||||
size_t thrPerGrp = workGroupInfo()->size_;
|
||||
|
||||
// Check if kernel uses images
|
||||
if (flags_.imageEna_ &&
|
||||
// and thread group is a multiple value of wavefronts
|
||||
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
||||
// and it's 2 or 3-dimensional workload
|
||||
(workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
|
||||
// Use 8x8 workgroup size if kernel has image writes
|
||||
if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
|
||||
lclWorkSize[0] = 8;
|
||||
lclWorkSize[1] = 8;
|
||||
}
|
||||
else {
|
||||
lclWorkSize[0] = 16;
|
||||
lclWorkSize[1] = 16;
|
||||
}
|
||||
if (workDim == 3) {
|
||||
lclWorkSize[2] = 1;
|
||||
}
|
||||
// Check if kernel uses images
|
||||
if (flags_.imageEna_ &&
|
||||
// and thread group is a multiple value of wavefronts
|
||||
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
||||
// and it's 2 or 3-dimensional workload
|
||||
(workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
|
||||
// Use 8x8 workgroup size if kernel has image writes
|
||||
if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
|
||||
lclWorkSize[0] = 8;
|
||||
lclWorkSize[1] = 8;
|
||||
}
|
||||
else {
|
||||
size_t tmp = thrPerGrp;
|
||||
// Split the local workgroup into the most efficient way
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
size_t div = tmp;
|
||||
for (; (gblWorkSize[d] % div) != 0; div--)
|
||||
;
|
||||
lclWorkSize[d] = div;
|
||||
tmp /= div;
|
||||
}
|
||||
lclWorkSize[0] = 16;
|
||||
lclWorkSize[1] = 16;
|
||||
}
|
||||
if (workDim == 3) {
|
||||
lclWorkSize[2] = 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
size_t tmp = thrPerGrp;
|
||||
// Split the local workgroup into the most efficient way
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
size_t div = tmp;
|
||||
for (; (gblWorkSize[d] % div) != 0; div--)
|
||||
;
|
||||
lclWorkSize[d] = div;
|
||||
tmp /= div;
|
||||
}
|
||||
|
||||
if (!workGroupInfo()->uniformWorkGroupSize_) {
|
||||
// Assuming DWORD access
|
||||
const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
|
||||
if (!workGroupInfo()->uniformWorkGroupSize_) {
|
||||
// Assuming DWORD access
|
||||
const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
|
||||
|
||||
// Check if we couldn't find optimal workload
|
||||
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
||||
// or size is too small for the cache line
|
||||
(lclWorkSize[0] < cacheLineMatch)) {
|
||||
size_t maxSize = 0;
|
||||
size_t maxDim = 0;
|
||||
// Check if we couldn't find optimal workload
|
||||
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
||||
// or size is too small for the cache line
|
||||
(lclWorkSize[0] < cacheLineMatch)) {
|
||||
size_t maxSize = 0;
|
||||
size_t maxDim = 0;
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
if (maxSize < gblWorkSize[d]) {
|
||||
maxSize = gblWorkSize[d];
|
||||
maxDim = d;
|
||||
}
|
||||
}
|
||||
// Use X dimension as high priority. Runtime will assume that
|
||||
// X dimension is more important for the address calculation
|
||||
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
||||
lclWorkSize[0] = cacheLineMatch;
|
||||
thrPerGrp /= cacheLineMatch;
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 1; d < workDim; ++d) {
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Check if a local workgroup has the most optimal size
|
||||
if (thrPerGrp > maxSize) {
|
||||
thrPerGrp = maxSize;
|
||||
}
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
if (maxSize < gblWorkSize[d]) {
|
||||
maxSize = gblWorkSize[d];
|
||||
maxDim = d;
|
||||
}
|
||||
}
|
||||
// Use X dimension as high priority. Runtime will assume that
|
||||
// X dimension is more important for the address calculation
|
||||
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
||||
lclWorkSize[0] = cacheLineMatch;
|
||||
thrPerGrp /= cacheLineMatch;
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 1; d < workDim; ++d) {
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Check if a local workgroup has the most optimal size
|
||||
if (thrPerGrp > maxSize) {
|
||||
thrPerGrp = maxSize;
|
||||
}
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Use overrides when app doesn't provide workgroup dimensions
|
||||
if (workDim == 1) {
|
||||
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
||||
}
|
||||
else if (workDim == 2) {
|
||||
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
||||
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
||||
}
|
||||
else if (workDim == 3) {
|
||||
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
||||
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
||||
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
||||
}
|
||||
else {
|
||||
assert(0 && "Invalid workDim!");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
||||
Reference in New Issue
Block a user