SWDEV-368235 - Remove obsolete env variables

Change-Id: I7e14d53297e79e2f68b3a6cc40251ad7db9eb5ab


[ROCm/clr commit: 7b50c935f8]
This commit is contained in:
German
2023-02-03 13:44:24 -05:00
parent 2258de8acd
commit dfa7790030
12 changed files with 106 additions and 273 deletions
-11
View File
@@ -798,17 +798,6 @@ Settings::Settings() : value_(0) {
commandQueues_ = 200; //!< Field value set to maximum number
//!< concurrent Virtual GPUs for default
overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0;
overrideLclSet |=
(!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y))
? 2
: 0;
overrideLclSet |=
(!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z))
? 4
: 0;
fenceScopeAgent_ = AMD_OPT_FLUSH;
if (amd::IS_HIP) {
if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) {
+1 -2
View File
@@ -619,7 +619,6 @@ class Settings : public amd::HeapObject {
uint64_t extensions_; //!< Supported OCL extensions
union {
struct {
uint overrideLclSet : 3; //!< Bit mask to override the local size
uint apuSystem_ : 1; //!< Device is APU system with shared memory
uint supportRA_ : 1; //!< Support RA channel order format
uint waitCommand_ : 1; //!< Enables a wait for every submitted command
@@ -639,7 +638,7 @@ class Settings : public amd::HeapObject {
uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device
uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet
uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions
uint reserved_ : 11;
uint reserved_ : 14;
};
uint value_;
};
+65 -85
View File
@@ -677,98 +677,78 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
// Find threads per group
size_t thrPerGrp = workGroupInfo()->size_;
// Find threads per group
size_t thrPerGrp = workGroupInfo()->size_;
// Check if kernel uses images
if (flags_.imageEna_ &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
// Use 8x8 workgroup size if kernel has image writes
if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
}
else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
// Check if kernel uses images
if (flags_.imageEna_ &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
// Use 8x8 workgroup size if kernel has image writes
if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
}
else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--)
;
lclWorkSize[d] = div;
tmp /= div;
}
// Assuming DWORD access
const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
// Check if we couldn't find optimal workload
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
// or size is too small for the cache line
(lclWorkSize[0] < cacheLineMatch)) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Use X dimension as high priority. Runtime will assume that
// X dimension is more important for the address calculation
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
lclWorkSize[0] = cacheLineMatch;
thrPerGrp /= cacheLineMatch;
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 1; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
else {
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
}
else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--)
;
lclWorkSize[d] = div;
tmp /= div;
}
else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
else {
assert(0 && "Invalid workDim!");
// Assuming DWORD access
const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
// Check if we couldn't find optimal workload
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
// or size is too small for the cache line
(lclWorkSize[0] < cacheLineMatch)) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Use X dimension as high priority. Runtime will assume that
// X dimension is more important for the address calculation
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
lclWorkSize[0] = cacheLineMatch;
thrPerGrp /= cacheLineMatch;
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 1; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
else {
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
}
+2 -2
View File
@@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve
nullptr,
nullptr,
nullptr,
AMD_OCL_SC_LIB};
nullptr};
// Initialize the compiler handle
acl_error error;
compiler_ = amd::Hsail::CompilerInit(&opts, &error);
@@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) {
nullptr,
nullptr,
nullptr,
AMD_OCL_SC_LIB};
nullptr};
// Initialize the compiler handle
acl_error error;
compiler_ = amd::Hsail::CompilerInit(&opts, &error);
+4 -35
View File
@@ -76,8 +76,6 @@ Settings::Settings() {
// Enable workload split by default (for 24 bit arithmetic or timeout)
workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;
// By default use host blit
blitEngine_ = BlitEngineHost;
pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi;
pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)
? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi;
@@ -123,8 +121,6 @@ Settings::Settings() {
//!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
linearPersistentImage_ = false;
useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
// Device enqueuing settings
numDeviceEvents_ = 1024;
numWaitEvents_ = 8;
@@ -328,16 +324,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
libSelector_ = amd::GPU_Library_CI;
if (LP64_SWITCH(false, true)) {
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
: OpenCL12;
}
if (GPU_FORCE_OCL20_32BIT) {
force32BitOcl20_ = true;
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
oclVersion_ = !reportAsOCL12Device
? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
: OpenCL12;
}
if (OPENCL_VERSION < 200) {
oclVersion_ = OpenCL12;
}
@@ -346,27 +337,13 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
// Cap at OpenCL20 for now
if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
// This needs to be cleaned once 64bit addressing is stable
if (oclVersion_ < OpenCL20) {
use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR)
? LP64_SWITCH(false,
/*calAttr.isWorkstation ||*/ true)
: GPU_FORCE_64BIT_PTR;
} else {
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
use64BitPtr_ = true;
}
}
use64BitPtr_ = LP64_SWITCH(false, true);
if (oclVersion_ >= OpenCL20) {
supportDepthsRGB_ = true;
}
if (use64BitPtr_) {
if (GPU_ENABLE_LARGE_ALLOCATION) {
maxAllocSize_ = 64ULL * Gi;
} else {
maxAllocSize_ = 4048 * Mi;
}
maxAllocSize_ = 64ULL * Gi;
} else {
maxAllocSize_ = 3ULL * Gi;
}
@@ -447,9 +424,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
imageSupport_ = true;
// Use kernels for blit if appropriate
blitEngine_ = BlitEngineKernel;
hostMemDirectAccess_ |= HostMemBuffer;
// HW doesn't support untiled image writes
// hostMemDirectAccess_ |= HostMemImage;
@@ -542,11 +516,6 @@ void Settings::override() {
preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
}
// Override blit engine type
if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
blitEngine_ = GPU_BLIT_ENGINE_TYPE;
}
if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
debugFlags_ = DEBUG_GPU_FLAGS;
}
@@ -70,7 +70,6 @@ class Settings : public device::Settings {
uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
uint threadTraceEnable_ : 1; //!< Thread trace enable
uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
uint useSingleScratch_ : 1; //!< Allocates single scratch per device
uint svmAtomics_ : 1; //!< SVM device atomics
uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
uint useDeviceQueue_ : 1; //!< Submit to separate device queue
@@ -82,7 +81,7 @@ class Settings : public device::Settings {
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
uint disableSdma_ : 1; //!< Disable SDMA support
uint alwaysResident_ : 1; //!< Make resources resident at allocation time
uint reserved_ : 7;
uint reserved_ : 8;
};
uint value_;
};
@@ -95,7 +94,6 @@ class Settings : public device::Settings {
uint workloadSplitSize_; //!< Workload split size
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
uint blitEngine_; //!< Blit engine type
uint cacheLineSize_; //!< Cache line size in bytes
uint cacheSize_; //!< L1 cache size in bytes
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
+3 -22
View File
@@ -905,11 +905,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// because destructor calls eraseResourceList() even if create() failed
dev().resizeResoureList(index());
if (index() >= GPU_MAX_COMMAND_QUEUES) {
// Cap the maximum number of concurrent Virtual GPUs
return false;
}
// Virtual GPU will have profiling enabled
state_.profiling_ = profiling;
@@ -1020,18 +1015,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
return false;
}
// Choose the appropriate class for blit engine
switch (dev().settings().blitEngine_) {
default:
// Fall through ...
case Settings::BlitEngineHost:
blitSetup.disableAll();
// Fall through ...
case Settings::BlitEngineCAL:
case Settings::BlitEngineKernel:
blitMgr_ = new KernelBlitManager(*this, blitSetup);
break;
}
blitMgr_ = new KernelBlitManager(*this, blitSetup);
if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
LogError("Could not create BlitManager!");
return false;
@@ -3269,11 +3253,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
uint64_t endTimeStampCPU = amd::Os::timeNanos();
// Make sure the command batch has a valid GPU TS
if (!GPU_RAW_TIMESTAMP) {
// Adjust the base time by the execution time
readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
}
// Adjust the base time by the execution time
readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
}
}
}
+28 -30
View File
@@ -894,37 +894,35 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
Device* dev = reinterpret_cast<Device*>(data);
switch (segment_type) {
case HSA_REGION_SEGMENT_GLOBAL: {
if (dev->settings().enableLocalMemory_) {
uint32_t global_flag = 0;
hsa_status_t stat =
hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (stat != HSA_STATUS_SUCCESS) {
return stat;
uint32_t global_flag = 0;
hsa_status_t stat =
hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (stat != HSA_STATUS_SUCCESS) {
return stat;
}
if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
dev->gpu_fine_grained_segment_ = pool;
} else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
dev->gpuvm_segment_ = pool;
// If cpu agent cannot access this pool, the device does not support large bar.
hsa_amd_memory_pool_access_t tmp{};
hsa_amd_agent_memory_pool_get_info(
dev->cpu_agent_,
pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&tmp);
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
dev->info_.largeBar_ = false;
} else {
dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR;
}
}
if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
dev->gpu_fine_grained_segment_ = pool;
} else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
dev->gpuvm_segment_ = pool;
// If cpu agent cannot access this pool, the device does not support large bar.
hsa_amd_memory_pool_access_t tmp{};
hsa_amd_agent_memory_pool_get_info(
dev->cpu_agent_,
pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&tmp);
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
dev->info_.largeBar_ = false;
} else {
dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR;
}
}
if (dev->gpuvm_segment_.handle == 0) {
dev->gpuvm_segment_ = pool;
}
if (dev->gpuvm_segment_.handle == 0) {
dev->gpuvm_segment_ = pool;
}
break;
}
@@ -1232,7 +1230,7 @@ bool Device::populateOCLDeviceConstants() {
info_.maxWorkItemDimensions_ = 3;
if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) {
if (gpuvm_segment_.handle != 0) {
size_t global_segment_size = 0;
if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_,
HSA_AMD_MEMORY_POOL_INFO_SIZE,
@@ -218,7 +218,7 @@ class NullDevice : public amd::Device {
//! Determine if we can use device memory for SVM
const bool forceFineGrain(amd::Memory* memory) const {
return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1);
return (memory->getContext().devices().size() > 1);
}
virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) {
@@ -35,18 +35,9 @@ Settings::Settings() {
// Set this to true when we drop the flag
doublePrecision_ = ::CL_KHR_FP64;
enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM;
maxWorkGroupSize_ = 1024;
preferredWorkGroupSize_ = 256;
maxWorkGroupSize2DX_ = 16;
maxWorkGroupSize2DY_ = 16;
maxWorkGroupSize3DX_ = 4;
maxWorkGroupSize3DY_ = 4;
maxWorkGroupSize3DZ_ = 4;
kernargPoolSize_ = HSA_KERNARG_POOL_SIZE;
// Determine if user is requesting Non-Coherent mode
@@ -201,23 +192,6 @@ void Settings::override() {
preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
}
if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
}
if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
}
if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
}
if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
}
@@ -42,8 +42,6 @@ class Settings : public device::Settings {
union {
struct {
uint doublePrecision_ : 1; //!< Enables double precision support
uint enableLocalMemory_ : 1; //!< Enable GPUVM memory
uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations
uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
@@ -55,7 +53,7 @@ class Settings : public device::Settings {
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
uint reserved_ : 18;
uint reserved_ : 20;
};
uint value_;
};
@@ -66,15 +64,6 @@ class Settings : public device::Settings {
//! Preferred workgroup size
uint preferredWorkGroupSize_;
//! Default max workgroup sizes for 2D
int maxWorkGroupSize2DX_;
int maxWorkGroupSize2DY_;
//! Default max workgroup sizes for 3D
int maxWorkGroupSize3DX_;
int maxWorkGroupSize3DY_;
int maxWorkGroupSize3DZ_;
uint kernargPoolSize_;
uint numDeviceEvents_; //!< The number of device events
uint numWaitEvents_; //!< The number of wait events for device enqueue
-44
View File
@@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \
"The mask to enable specific kinds of logs") \
debug(uint, DEBUG_GPU_FLAGS, 0, \
"The debug options for GPU device") \
release(uint, GPU_MAX_COMMAND_QUEUES, 300, \
"The maximum number of concurrent Virtual GPUs") \
release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \
"The default command queue thread stack size") \
release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
"Maximum number of workitems in a workgroup for GPU, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \
"Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \
"Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \
"Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \
"Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \
"Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \
debug(bool, CPU_MEMORY_GUARD_PAGES, false, \
"Use guard pages for CPU memory") \
debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \
@@ -70,12 +58,8 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4, \
"Size of the GPU staging buffer in MiB") \
release(bool, GPU_DUMP_BLIT_KERNELS, false, \
"Dump the kernels for blit manager") \
release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \
"Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \
release(bool, GPU_FLUSH_ON_EXECUTION, false, \
"Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \
release(bool, GPU_USE_SYNC_OBJECTS, true, \
"If enabled, use sync objects instead of polling") \
release(bool, CL_KHR_FP64, true, \
"Enable/Disable support for double precision") \
release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \
@@ -86,12 +70,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \
"Set clLinkProgram()'s options (override)") \
release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \
"Append clLinkProgram()'s options") \
release(cstring, AMD_OCL_SC_LIB, 0, \
"Set shader compiler shared library name or path") \
debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \
"Specify binary substitution config file for OpenCL") \
debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \
"Enable the error dialog on Windows") \
release(size_t, GPU_PINNED_XFER_SIZE, 32, \
"The pinned buffer size for pinning in read/write transfers in MiB") \
release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \
@@ -100,12 +80,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
"The resource cache size in MB") \
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
"The maximum size accepted for suballocaitons in KB") \
release(bool, GPU_FORCE_64BIT_PTR, 0, \
"Forces 64 bit pointers on GPU") \
release(bool, GPU_FORCE_OCL20_32BIT, 0, \
"Forces 32 bit apps to take CLANG\HSAIL path") \
release(bool, GPU_RAW_TIMESTAMP, 0, \
"Reports GPU raw timestamps in GPU timeline") \
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \
"Number of memory objects for dependency tracking") \
release(size_t, GPU_XFER_BUFFER_SIZE, 0, \
@@ -116,32 +90,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \
"Maximum size of a single allocation as percentage of total") \
release(uint, GPU_NUM_COMPUTE_RINGS, 2, \
"GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \
"GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \
release(uint, GPU_WORKLOAD_SPLIT, 22, \
"Workload split size") \
release(bool, GPU_USE_SINGLE_SCRATCH, false, \
"Use single scratch buffer per device instead of per HW ring") \
release(bool, AMD_OCL_WAIT_COMMAND, false, \
"1 = Enable a wait for every submitted command") \
release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
"Prints the specified number of the child kernels") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
"Use a dedicated device queue for the actual submissions") \
release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \
"Enable >4GB single allocations") \
release(bool, AMD_THREAD_TRACE_ENABLE, true, \
"Enable thread trace extension") \
release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \
"Force GPU opencl verison") \
release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \
"Enable HSA device local memory usage") \
release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \
"Kernarg pool size") \
release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \
"Enable device memory for coarse grain SVM allocations") \
release(bool, GPU_IFH_MODE, false, \
"1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
release(bool, GPU_MIPMAP, true, \
"Enables GPU mipmap extension") \
release(uint, GPU_ENABLE_PAL, 2, \
@@ -152,8 +114,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \
"Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
release(uint, OCL_SET_SVM_SIZE, 4*16384, \
"set SVM space size for discrete GPU") \
debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \
"Use flag to change the minimum requirement of system memory not to downgrade") \
release(uint, GPU_WAVES_PER_SIMD, 0, \
"Force the number of waves per SIMD (1-10)") \
release(bool, GPU_WAVE_LIMIT_ENABLE, false, \
@@ -176,10 +136,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \
"File path prefix for dumping wave limiter output") \
release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \
"File path prefix for tracing wave limiter") \
release(bool, OCL_CODE_CACHE_ENABLE, false, \
"1 = Enable compiler code cache") \
release(bool, OCL_CODE_CACHE_RESET, false, \
"1 = Reset the compiler code cache storage") \
release(bool, PAL_DISABLE_SDMA, false, \
"1 = Disable SDMA for PAL") \
release(uint, PAL_RGP_DISP_COUNT, 10000, \