SWDEV-407533 - [ABI Break] Purge unused env vars
Change-Id: I627950e8ebb6299affc602754a20d442dbe42b14
Este commit está contenido en:
@@ -933,20 +933,6 @@ Settings::Settings() : value_(0) {
|
||||
customHostAllocator_ = false;
|
||||
waitCommand_ = AMD_OCL_WAIT_COMMAND;
|
||||
supportDepthsRGB_ = false;
|
||||
commandQueues_ = 200; //!< Field value set to maximum number
|
||||
//!< concurrent Virtual GPUs for default
|
||||
|
||||
overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0;
|
||||
overrideLclSet |=
|
||||
(!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y))
|
||||
? 2
|
||||
: 0;
|
||||
overrideLclSet |=
|
||||
(!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
|
||||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z))
|
||||
? 4
|
||||
: 0;
|
||||
|
||||
fenceScopeAgent_ = AMD_OPT_FLUSH;
|
||||
if (amd::IS_HIP) {
|
||||
if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) {
|
||||
|
||||
@@ -640,7 +640,6 @@ class Settings : public amd::HeapObject {
|
||||
uint64_t extensions_; //!< Supported OCL extensions
|
||||
union {
|
||||
struct {
|
||||
uint overrideLclSet : 3; //!< Bit mask to override the local size
|
||||
uint apuSystem_ : 1; //!< Device is APU system with shared memory
|
||||
uint supportRA_ : 1; //!< Support RA channel order format
|
||||
uint waitCommand_ : 1; //!< Enables a wait for every submitted command
|
||||
@@ -660,14 +659,11 @@ class Settings : public amd::HeapObject {
|
||||
uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device
|
||||
uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet
|
||||
uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions
|
||||
uint reserved_ : 11;
|
||||
uint reserved_ : 14;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
uint commandQueues_; //!< Field value for maximum number
|
||||
//!< concurrent Virtual GPUs for each backend
|
||||
|
||||
//! Default constructor
|
||||
Settings();
|
||||
|
||||
@@ -1383,7 +1379,7 @@ class Isa {
|
||||
|
||||
/// @returns If the ROCm runtime supports the ISA.
|
||||
bool runtimeRocSupported() const {
|
||||
if (!IS_HIP && !ROC_ENABLE_PRE_VEGA && (versionMajor_ == 8)) {
|
||||
if (!IS_HIP && (versionMajor_ == 8)) {
|
||||
return false;
|
||||
}
|
||||
return runtimeRocSupported_;
|
||||
|
||||
@@ -684,102 +684,82 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
|
||||
if (workGroupInfo()->compileSize_[0] == 0) {
|
||||
// Find the default local workgroup size, if it wasn't specified
|
||||
if (lclWorkSize[0] == 0) {
|
||||
if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
|
||||
// Find threads per group
|
||||
size_t thrPerGrp = workGroupInfo()->size_;
|
||||
// Find threads per group
|
||||
size_t thrPerGrp = workGroupInfo()->size_;
|
||||
|
||||
// Check if kernel uses images
|
||||
if (flags_.imageEna_ &&
|
||||
// and thread group is a multiple value of wavefronts
|
||||
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
||||
// and it's 2 or 3-dimensional workload
|
||||
(workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
|
||||
// Use 8x8 workgroup size if kernel has image writes
|
||||
if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
|
||||
lclWorkSize[0] = 8;
|
||||
lclWorkSize[1] = 8;
|
||||
}
|
||||
else {
|
||||
lclWorkSize[0] = 16;
|
||||
lclWorkSize[1] = 16;
|
||||
}
|
||||
if (workDim == 3) {
|
||||
lclWorkSize[2] = 1;
|
||||
}
|
||||
// Check if kernel uses images
|
||||
if (flags_.imageEna_ &&
|
||||
// and thread group is a multiple value of wavefronts
|
||||
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
||||
// and it's 2 or 3-dimensional workload
|
||||
(workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
|
||||
// Use 8x8 workgroup size if kernel has image writes
|
||||
if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
|
||||
lclWorkSize[0] = 8;
|
||||
lclWorkSize[1] = 8;
|
||||
}
|
||||
else {
|
||||
size_t tmp = thrPerGrp;
|
||||
// Split the local workgroup into the most efficient way
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
size_t div = tmp;
|
||||
for (; (gblWorkSize[d] % div) != 0; div--)
|
||||
;
|
||||
lclWorkSize[d] = div;
|
||||
tmp /= div;
|
||||
}
|
||||
lclWorkSize[0] = 16;
|
||||
lclWorkSize[1] = 16;
|
||||
}
|
||||
if (workDim == 3) {
|
||||
lclWorkSize[2] = 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
size_t tmp = thrPerGrp;
|
||||
// Split the local workgroup into the most efficient way
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
size_t div = tmp;
|
||||
for (; (gblWorkSize[d] % div) != 0; div--)
|
||||
;
|
||||
lclWorkSize[d] = div;
|
||||
tmp /= div;
|
||||
}
|
||||
|
||||
if (!workGroupInfo()->uniformWorkGroupSize_) {
|
||||
// Assuming DWORD access
|
||||
const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
|
||||
if (!workGroupInfo()->uniformWorkGroupSize_) {
|
||||
// Assuming DWORD access
|
||||
const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
|
||||
|
||||
// Check if we couldn't find optimal workload
|
||||
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
||||
// or size is too small for the cache line
|
||||
(lclWorkSize[0] < cacheLineMatch)) {
|
||||
size_t maxSize = 0;
|
||||
size_t maxDim = 0;
|
||||
// Check if we couldn't find optimal workload
|
||||
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
||||
// or size is too small for the cache line
|
||||
(lclWorkSize[0] < cacheLineMatch)) {
|
||||
size_t maxSize = 0;
|
||||
size_t maxDim = 0;
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
if (maxSize < gblWorkSize[d]) {
|
||||
maxSize = gblWorkSize[d];
|
||||
maxDim = d;
|
||||
}
|
||||
}
|
||||
// Use X dimension as high priority. Runtime will assume that
|
||||
// X dimension is more important for the address calculation
|
||||
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
||||
lclWorkSize[0] = cacheLineMatch;
|
||||
thrPerGrp /= cacheLineMatch;
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 1; d < workDim; ++d) {
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Check if a local workgroup has the most optimal size
|
||||
if (thrPerGrp > maxSize) {
|
||||
thrPerGrp = maxSize;
|
||||
}
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
if (maxSize < gblWorkSize[d]) {
|
||||
maxSize = gblWorkSize[d];
|
||||
maxDim = d;
|
||||
}
|
||||
}
|
||||
// Use X dimension as high priority. Runtime will assume that
|
||||
// X dimension is more important for the address calculation
|
||||
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
||||
lclWorkSize[0] = cacheLineMatch;
|
||||
thrPerGrp /= cacheLineMatch;
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 1; d < workDim; ++d) {
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Check if a local workgroup has the most optimal size
|
||||
if (thrPerGrp > maxSize) {
|
||||
thrPerGrp = maxSize;
|
||||
}
|
||||
lclWorkSize[maxDim] = thrPerGrp;
|
||||
for (uint d = 0; d < workDim; ++d) {
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
if (d != maxDim) {
|
||||
lclWorkSize[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Use overrides when app doesn't provide workgroup dimensions
|
||||
if (workDim == 1) {
|
||||
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
||||
}
|
||||
else if (workDim == 2) {
|
||||
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
||||
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
||||
}
|
||||
else if (workDim == 3) {
|
||||
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
||||
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
||||
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
||||
}
|
||||
else {
|
||||
assert(0 && "Invalid workDim!");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -300,7 +300,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
AMD_OCL_SC_LIB};
|
||||
nullptr};
|
||||
// Initialize the compiler handle
|
||||
acl_error error;
|
||||
compiler_ = amd::Hsail::CompilerInit(&opts, &error);
|
||||
@@ -1029,7 +1029,7 @@ bool Device::create(Pal::IDevice* device) {
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
AMD_OCL_SC_LIB};
|
||||
nullptr};
|
||||
// Initialize the compiler handle
|
||||
acl_error error;
|
||||
compiler_ = amd::Hsail::CompilerInit(&opts, &error);
|
||||
|
||||
@@ -33,19 +33,6 @@
|
||||
|
||||
namespace pal {
|
||||
|
||||
/*! \brief information for adjusting maximum workload time
|
||||
*
|
||||
* This structure contains the time and OS minor version for max workload time
|
||||
* adjustment for Windows 7 or 8.
|
||||
*/
|
||||
struct ModifyMaxWorkload {
  //! Max workload time (10x ms)
  uint32_t time;
  //! OS minor version the adjustment is matched against
  uint32_t minorVersion;
#if defined(_WIN32)
  //! Comparison option used when matching the OS version (Windows builds only)
  BYTE comparisonOps;
#endif
};
|
||||
|
||||
Settings::Settings() {
|
||||
// Initialize the GPU device default settings
|
||||
oclVersion_ = OpenCL12;
|
||||
@@ -73,9 +60,6 @@ Settings::Settings() {
|
||||
|
||||
libSelector_ = amd::LibraryUndefined;
|
||||
|
||||
// Enable workload split by default (for 24 bit arithmetic or timeout)
|
||||
workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;
|
||||
|
||||
// By default use host blit
|
||||
blitEngine_ = BlitEngineHost;
|
||||
pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi;
|
||||
@@ -117,15 +101,10 @@ Settings::Settings() {
|
||||
// Number of compute rings.
|
||||
numComputeRings_ = 0;
|
||||
|
||||
minWorkloadTime_ = 1; // 0.1 ms
|
||||
maxWorkloadTime_ = 500000; // 500 ms
|
||||
|
||||
// Controls tiled images in persistent
|
||||
//!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
|
||||
linearPersistentImage_ = false;
|
||||
|
||||
useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
|
||||
|
||||
// Device enqueuing settings
|
||||
numDeviceEvents_ = 1024;
|
||||
numWaitEvents_ = 8;
|
||||
@@ -177,14 +156,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
|
||||
// Disable thread trace by default for all devices
|
||||
threadTraceEnable_ = false;
|
||||
bool doublePrecision = true;
|
||||
|
||||
// Update GPU specific settings and info structure if we have any
|
||||
#if defined(_WIN32)
|
||||
ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL};
|
||||
#else
|
||||
ModifyMaxWorkload modifyMaxWorkload = {0};
|
||||
#endif
|
||||
|
||||
// APU systems
|
||||
if (palProp.gpuType == Pal::GpuType::Integrated) {
|
||||
@@ -250,14 +221,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
// GFX10.1 HW doesn't support custom pitch. Enable double copy workaround
|
||||
imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
|
||||
}
|
||||
if (false) {
|
||||
// UnknownDevice0 HW doesn't have SDMA engine
|
||||
disableSdma_ = true;
|
||||
// And LDS is limited to 32KB
|
||||
hwLDSSize_ = 32 * Ki;
|
||||
// No fp64 support
|
||||
doublePrecision = false;
|
||||
}
|
||||
// Fall through to AI (gfx9) ...
|
||||
case Pal::AsicRevision::Vega20:
|
||||
// Enable HW P2P path for Vega20+. Runtime still relies on KMD/PAL for support
|
||||
@@ -277,15 +240,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
case Pal::AsicRevision::Carrizo:
|
||||
case Pal::AsicRevision::Bristol:
|
||||
case Pal::AsicRevision::Stoney:
|
||||
if (!aiPlus_) {
|
||||
// Fix BSOD/TDR issues observed on Stoney Win7/8.1/10
|
||||
minWorkloadTime_ = 1000;
|
||||
modifyMaxWorkload.time = 1000; // Decided by experiment
|
||||
modifyMaxWorkload.minorVersion = 1; // Win 7
|
||||
#if defined(_WIN32)
|
||||
modifyMaxWorkload.comparisonOps = VER_EQUAL; // Limit to Win 7 only
|
||||
#endif
|
||||
}
|
||||
case Pal::AsicRevision::Iceland:
|
||||
case Pal::AsicRevision::Tonga:
|
||||
case Pal::AsicRevision::Fiji:
|
||||
@@ -307,15 +261,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
case Pal::AsicRevision::Godavari:
|
||||
case Pal::AsicRevision::Spectre:
|
||||
case Pal::AsicRevision::Spooky:
|
||||
if (!viPlus_) {
|
||||
// Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
|
||||
modifyMaxWorkload.time = 250000; // 250ms
|
||||
modifyMaxWorkload.minorVersion = 1; // Win 7
|
||||
#if defined(_WIN32)
|
||||
modifyMaxWorkload.comparisonOps = VER_EQUAL; // limit to Win 7
|
||||
#endif
|
||||
}
|
||||
// Fall through ...
|
||||
case Pal::AsicRevision::Bonaire:
|
||||
case Pal::AsicRevision::Hawaii:
|
||||
case Pal::AsicRevision::HawaiiPro:
|
||||
@@ -331,13 +276,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
|
||||
libSelector_ = amd::GPU_Library_CI;
|
||||
if (LP64_SWITCH(false, true)) {
|
||||
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
|
||||
? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
|
||||
: OpenCL12;
|
||||
}
|
||||
if (GPU_FORCE_OCL20_32BIT) {
|
||||
force32BitOcl20_ = true;
|
||||
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
|
||||
oclVersion_ = !reportAsOCL12Device
|
||||
? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
|
||||
: OpenCL12;
|
||||
}
|
||||
@@ -348,28 +287,14 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
|
||||
// Cap at OpenCL20 for now
|
||||
if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
|
||||
|
||||
// This needs to be cleaned once 64bit addressing is stable
|
||||
if (oclVersion_ < OpenCL20) {
|
||||
use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR)
|
||||
? LP64_SWITCH(false,
|
||||
/*calAttr.isWorkstation ||*/ true)
|
||||
: GPU_FORCE_64BIT_PTR;
|
||||
} else {
|
||||
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
|
||||
use64BitPtr_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
use64BitPtr_ = LP64_SWITCH(false, true);
|
||||
|
||||
if (oclVersion_ >= OpenCL20) {
|
||||
supportDepthsRGB_ = true;
|
||||
}
|
||||
if (use64BitPtr_) {
|
||||
if (GPU_ENABLE_LARGE_ALLOCATION) {
|
||||
maxAllocSize_ = 64ULL * Gi;
|
||||
} else {
|
||||
maxAllocSize_ = 4048 * Mi;
|
||||
}
|
||||
maxAllocSize_ = 64ULL * Gi;
|
||||
} else {
|
||||
maxAllocSize_ = 3ULL * Gi;
|
||||
}
|
||||
@@ -395,26 +320,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
// Image DMA must be disabled if SDMA is disabled
|
||||
imageDMA_ &= !disableSdma_;
|
||||
|
||||
splitSizeForWin7_ = false;
|
||||
|
||||
#if defined(_WIN32)
|
||||
OSVERSIONINFOEX versionInfo = {0};
|
||||
versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
|
||||
versionInfo.dwMajorVersion = 6;
|
||||
versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
|
||||
|
||||
DWORDLONG conditionMask = 0;
|
||||
VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
|
||||
VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
|
||||
|
||||
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
|
||||
splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7
|
||||
if (modifyMaxWorkload.time > 0) {
|
||||
maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
|
||||
}
|
||||
}
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
// Enable atomics support
|
||||
enableExtension(ClKhrInt64BaseAtomics);
|
||||
enableExtension(ClKhrInt64ExtendedAtomics);
|
||||
@@ -457,23 +362,19 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
// HW doesn't support untiled image writes
|
||||
// hostMemDirectAccess_ |= HostMemImage;
|
||||
|
||||
if (doublePrecision) {
|
||||
// Report FP_FAST_FMA define if double precision HW
|
||||
reportFMA_ = true;
|
||||
// FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
|
||||
// Bonaire, Kalindi, Spectre and Spooky so disable
|
||||
// FP_FMA_FMAF for those parts in switch below
|
||||
reportFMAF_ = true;
|
||||
}
|
||||
// Report FP_FAST_FMA define if double precision HW
|
||||
reportFMA_ = true;
|
||||
// FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
|
||||
// Bonaire, Kalindi, Spectre and Spooky so disable
|
||||
// FP_FMA_FMAF for those parts in switch below
|
||||
reportFMAF_ = true;
|
||||
|
||||
// Make sure device actually supports double precision
|
||||
doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
|
||||
if (doublePrecision_) {
|
||||
// Enable KHR double precision extension
|
||||
enableExtension(ClKhrFp64);
|
||||
}
|
||||
|
||||
if (!useLightning_ && doublePrecision) {
|
||||
if (!useLightning_) {
|
||||
// Enable AMD double precision extension
|
||||
doublePrecision_ = true;
|
||||
enableExtension(ClAmdFp64);
|
||||
|
||||
@@ -70,19 +70,17 @@ class Settings : public device::Settings {
|
||||
uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
|
||||
uint threadTraceEnable_ : 1; //!< Thread trace enable
|
||||
uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
|
||||
uint useSingleScratch_ : 1; //!< Allocates single scratch per device
|
||||
uint svmAtomics_ : 1; //!< SVM device atomics
|
||||
uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
|
||||
uint useDeviceQueue_ : 1; //!< Submit to separate device queue
|
||||
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
|
||||
uint rgpSqttWaitIdle_ : 1; //!< Wait for idle after SQTT trace
|
||||
uint rgpSqttForceDisable_ : 1; //!< Disables SQTT
|
||||
uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7
|
||||
uint enableHwP2P_ : 1; //!< Forces HW P2P path for testing
|
||||
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
|
||||
uint disableSdma_ : 1; //!< Disable SDMA support
|
||||
uint alwaysResident_ : 1; //!< Make resources resident at allocation time
|
||||
uint reserved_ : 7;
|
||||
uint reserved_ : 9;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
@@ -92,9 +90,6 @@ class Settings : public device::Settings {
|
||||
uint hwLDSSize_; //!< HW local data store size
|
||||
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
|
||||
uint preferredWorkGroupSize_; //!< Requested preferred workgroup size for this device
|
||||
uint workloadSplitSize_; //!< Workload split size
|
||||
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
|
||||
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
|
||||
uint blitEngine_; //!< Blit engine type
|
||||
uint cacheLineSize_; //!< Cache line size in bytes
|
||||
uint cacheSize_; //!< L1 cache size in bytes
|
||||
|
||||
@@ -697,61 +697,6 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
|
||||
}
|
||||
}
|
||||
|
||||
//! Builds the DMA flush bookkeeping for \p dev.
//!
//! Derives the ALU count from the device's SIMDs-per-CU, SIMD width and compute-unit count,
//! then computes the per-dispatch workload ceiling as
//! (max engine clock) x (maxWorkloadTime_ setting) x (ALU count), and finally seeds the
//! command-buffer limit through resetCbWorkload().
VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) {
  const auto& devInfo = dev.info();

  // Total ALU lanes on the chip
  aluCnt_ = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu * devInfo.simdWidth_ *
      devInfo.maxComputeUnits_;

  // Widen the clock first so the whole product is evaluated in 64 bits
  const uint64_t engineClock = static_cast<uint64_t>(devInfo.maxEngineClockFrequency_);
  maxDispatchWorkload_ = engineClock * dev.settings().maxWorkloadTime_ * aluCnt_;

  resetCbWorkload(dev);
}
|
||||
|
||||
//! Clears the accumulated command-buffer workload and restores the command-buffer limit to
//! (max engine clock) x (minWorkloadTime_ setting) x (ALU count).
void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
  // Widen the clock first so the whole product is evaluated in 64 bits
  const uint64_t engineClock = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_);

  maxCbWorkload_ = engineClock * dev.settings().minWorkloadTime_ * aluCnt_;
  cbWorkload_ = 0;
}
|
||||
|
||||
void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
|
||||
uint instructions) {
|
||||
if (!dev.settings().splitSizeForWin7_) {
|
||||
dispatchSplitSize_ = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t workload = threads * instructions;
|
||||
if (maxDispatchWorkload_ < workload) {
|
||||
dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
|
||||
uint fullLoad = dev.info().maxComputeUnits_ * dev.info().preferredWorkGroupSize_;
|
||||
if ((dispatchSplitSize_ % fullLoad) != 0) {
|
||||
dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad;
|
||||
}
|
||||
} else {
|
||||
dispatchSplitSize_ =
|
||||
(threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0;
|
||||
}
|
||||
}
|
||||
|
||||
//! Accounts a new dispatch in the current DMA command buffer and reports whether the
//! buffer has accumulated enough work to be flushed.
//!
//! \param gpu          Virtual GPU object (not used by this implementation)
//! \param threads      Total number of execution threads
//! \param instructions Number of ALU instructions
//! \return true when the accumulated workload exceeded the current command-buffer limit
bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) {
  // Thread count is aligned up to 4 x ALU count before being weighted by the instruction count
  const uint64_t dispatchCost = amd::alignUp(threads, 4 * aluCnt_) * instructions;

  // Add current workload to the overall workload in the current DMA
  cbWorkload_ += dispatchCost;

  // Still within the limit: nothing to flush yet
  if (cbWorkload_ <= maxCbWorkload_) {
    return false;
  }

  // Limit exceeded: start counting from scratch for the next DMA buffer ...
  cbWorkload_ = 0;

  // ... and let the next buffer hold 50% more work, capped at the per-dispatch ceiling
  const uint64_t grownLimit = maxCbWorkload_ * 3 / 2;
  maxCbWorkload_ = (grownLimit > maxDispatchWorkload_) ? maxDispatchWorkload_ : grownLimit;

  return true;
}
|
||||
|
||||
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
|
||||
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
|
||||
if (pinnedMems_.size() > 7) {
|
||||
@@ -897,7 +842,6 @@ VirtualGPU::VirtualGPU(Device& device)
|
||||
gpuDevice_(static_cast<Device&>(device)),
|
||||
printfDbgHSA_(nullptr),
|
||||
tsCache_(nullptr),
|
||||
dmaFlushMgmt_(device),
|
||||
managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
|
||||
writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
|
||||
hwRing_(0),
|
||||
@@ -932,11 +876,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
// because destructor calls eraseResourceList() even if create() failed
|
||||
dev().resizeResoureList(index());
|
||||
|
||||
if (index() >= GPU_MAX_COMMAND_QUEUES) {
|
||||
// Cap the maximum number of concurrent Virtual GPUs
|
||||
return false;
|
||||
}
|
||||
|
||||
// Virtual GPU will have profiling enabled
|
||||
state_.profiling_ = profiling;
|
||||
|
||||
@@ -2632,16 +2571,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
// Add ISA memory object to the resource tracking list
|
||||
AddKernel(kernel);
|
||||
|
||||
bool needFlush = false;
|
||||
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
|
||||
// are in the same cmdBuffer
|
||||
if (!state_.perfCounterEnabled_) {
|
||||
dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize());
|
||||
if (dmaFlushMgmt().dispatchSplitSize() != 0) {
|
||||
needFlush = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it is blit kernel. If it is, then check if split is needed.
|
||||
if (hsaKernel.isInternalKernel()) {
|
||||
// Calculate new group size for each submission
|
||||
@@ -2737,7 +2666,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
|
||||
// Update the global GPU event
|
||||
setGpuEvent(gpuEvent, needFlush);
|
||||
constexpr bool kNeedFLush = false;
|
||||
setGpuEvent(gpuEvent, kNeedFLush);
|
||||
|
||||
if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
|
||||
LogError("Couldn't read printf data from the buffer!\n");
|
||||
@@ -2799,10 +2729,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
if (!foundEvent) {
|
||||
state_.forceWait_ = true;
|
||||
}
|
||||
// If we don't have any more batches, then assume GPU is idle
|
||||
else if (cbQueue_.empty()) {
|
||||
dmaFlushMgmt_.resetCbWorkload(dev());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3325,11 +3251,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
|
||||
cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
|
||||
|
||||
uint64_t endTimeStampCPU = amd::Os::timeNanos();
|
||||
// Make sure the command batch has a valid GPU TS
|
||||
if (!GPU_RAW_TIMESTAMP) {
|
||||
// Adjust the base time by the execution time
|
||||
readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
|
||||
}
|
||||
// Adjust the base time by the execution time
|
||||
readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -290,36 +290,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
|
||||
};
|
||||
|
||||
class DmaFlushMgmt : public amd::EmbeddedObject {
|
||||
public:
|
||||
DmaFlushMgmt(const Device& dev);
|
||||
|
||||
// Resets DMA command buffer workload
|
||||
void resetCbWorkload(const Device& dev);
|
||||
|
||||
// Finds split size for the current dispatch
|
||||
void findSplitSize(const Device& dev, //!< GPU device object
|
||||
uint64_t threads, //!< Total number of execution threads
|
||||
uint instructions //!< Number of ALU instructions
|
||||
);
|
||||
|
||||
// Returns TRUE if DMA command buffer is ready for a flush
|
||||
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
uint64_t threads, //!< Total number of execution threads
|
||||
uint instructions //!< Number of ALU instructions
|
||||
);
|
||||
|
||||
// Returns dispatch split size
|
||||
uint dispatchSplitSize() const { return dispatchSplitSize_; }
|
||||
|
||||
private:
|
||||
uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch
|
||||
uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer
|
||||
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer
|
||||
uint aluCnt_; //!< All ALUs on the chip
|
||||
uint dispatchSplitSize_; //!< Dispath split size in elements
|
||||
};
|
||||
|
||||
public:
|
||||
VirtualGPU(Device& device);
|
||||
//! Creates virtual gpu object
|
||||
@@ -470,9 +440,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Returns hsaQueueMem_
|
||||
const Memory* hsaQueueMem() const { return hsaQueueMem_; }
|
||||
|
||||
//! Returns DMA flush management structure
|
||||
const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
|
||||
|
||||
//! Returns the HW ring used on this virtual device
|
||||
uint hwRing() const { return hwRing_; }
|
||||
|
||||
@@ -695,8 +662,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
TimeStampCache* tsCache_; //!< TimeStamp cache
|
||||
MemoryDependency memoryDependency_; //!< Memory dependency class
|
||||
|
||||
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
|
||||
|
||||
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
||||
|
||||
ManagedBuffer managedBuffer_; //!< Managed write buffer
|
||||
|
||||
@@ -233,7 +233,7 @@ class NullDevice : public amd::Device {
|
||||
|
||||
//! Determine if we can use device memory for SVM
|
||||
const bool forceFineGrain(amd::Memory* memory) const {
|
||||
return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1);
|
||||
return (memory->getContext().devices().size() > 1);
|
||||
}
|
||||
|
||||
virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle,
|
||||
|
||||
@@ -36,17 +36,10 @@ Settings::Settings() {
|
||||
doublePrecision_ = ::CL_KHR_FP64;
|
||||
|
||||
enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
|
||||
enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM;
|
||||
|
||||
maxWorkGroupSize_ = 1024;
|
||||
preferredWorkGroupSize_ = 256;
|
||||
|
||||
maxWorkGroupSize2DX_ = 16;
|
||||
maxWorkGroupSize2DY_ = 16;
|
||||
maxWorkGroupSize3DX_ = 4;
|
||||
maxWorkGroupSize3DY_ = 4;
|
||||
maxWorkGroupSize3DZ_ = 4;
|
||||
|
||||
kernargPoolSize_ = HSA_KERNARG_POOL_SIZE;
|
||||
|
||||
// Determine if user is requesting Non-Coherent mode
|
||||
@@ -96,7 +89,6 @@ Settings::Settings() {
|
||||
cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
|
||||
ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
|
||||
system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
|
||||
skip_copy_sync_ = ROC_SKIP_COPY_SYNC;
|
||||
|
||||
// Use coarse grain system memory for kernel arguments by default (to keep GPU cache)
|
||||
fgs_kernel_arg_ = false;
|
||||
@@ -201,23 +193,6 @@ void Settings::override() {
|
||||
preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
|
||||
}
|
||||
|
||||
if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
|
||||
maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
||||
}
|
||||
if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
|
||||
maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
||||
}
|
||||
|
||||
if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
|
||||
maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
||||
}
|
||||
if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
|
||||
maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
||||
}
|
||||
if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
|
||||
maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
|
||||
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
|
||||
}
|
||||
|
||||
@@ -43,7 +43,6 @@ class Settings : public device::Settings {
|
||||
struct {
|
||||
uint doublePrecision_ : 1; //!< Enables double precision support
|
||||
uint enableLocalMemory_ : 1; //!< Enable GPUVM memory
|
||||
uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations
|
||||
uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory
|
||||
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
|
||||
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
|
||||
@@ -51,11 +50,10 @@ class Settings : public device::Settings {
|
||||
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
|
||||
uint cpu_wait_for_signal_ : 1; //!< Wait for HSA signal on CPU
|
||||
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
|
||||
uint skip_copy_sync_ : 1; //!< Ignore explicit HSA signal waits for copy functionality
|
||||
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
|
||||
uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+
|
||||
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
|
||||
uint reserved_ : 18;
|
||||
uint reserved_ : 20;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
@@ -66,15 +64,6 @@ class Settings : public device::Settings {
|
||||
//! Preferred workgroup size
|
||||
uint preferredWorkGroupSize_;
|
||||
|
||||
//! Default max workgroup sizes for 2D
|
||||
int maxWorkGroupSize2DX_;
|
||||
int maxWorkGroupSize2DY_;
|
||||
|
||||
//! Default max workgroup sizes for 3D
|
||||
int maxWorkGroupSize3DX_;
|
||||
int maxWorkGroupSize3DY_;
|
||||
int maxWorkGroupSize3DZ_;
|
||||
|
||||
uint kernargPoolSize_;
|
||||
uint numDeviceEvents_; //!< The number of device events
|
||||
uint numWaitEvents_; //!< The number of wait events for device enqueue
|
||||
|
||||
@@ -490,7 +490,7 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
|
||||
// Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr
|
||||
// and ignore the signal wait, relying on in-order engine execution
|
||||
const Settings& settings = gpu_.dev().settings();
|
||||
if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) {
|
||||
if (engine != HwQueueEngine::Compute) {
|
||||
explicit_wait = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \
|
||||
"The mask to enable specific kinds of logs") \
|
||||
debug(uint, DEBUG_GPU_FLAGS, 0, \
|
||||
"The debug options for GPU device") \
|
||||
release(uint, GPU_MAX_COMMAND_QUEUES, 300, \
|
||||
"The maximum number of concurrent Virtual GPUs") \
|
||||
release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \
|
||||
"The default command queue thread stack size") \
|
||||
release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
|
||||
"Maximum number of workitems in a workgroup for GPU, 0 -use default") \
|
||||
release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \
|
||||
"Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \
|
||||
release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \
|
||||
"Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \
|
||||
release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \
|
||||
"Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \
|
||||
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \
|
||||
"Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \
|
||||
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \
|
||||
"Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \
|
||||
debug(bool, CPU_MEMORY_GUARD_PAGES, false, \
|
||||
"Use guard pages for CPU memory") \
|
||||
debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \
|
||||
@@ -74,8 +62,6 @@ release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \
|
||||
"Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \
|
||||
release(bool, GPU_FLUSH_ON_EXECUTION, false, \
|
||||
"Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \
|
||||
release(bool, GPU_USE_SYNC_OBJECTS, true, \
|
||||
"If enabled, use sync objects instead of polling") \
|
||||
release(bool, CL_KHR_FP64, true, \
|
||||
"Enable/Disable support for double precision") \
|
||||
release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \
|
||||
@@ -86,12 +72,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \
|
||||
"Set clLinkProgram()'s options (override)") \
|
||||
release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \
|
||||
"Append clLinkProgram()'s options") \
|
||||
release(cstring, AMD_OCL_SC_LIB, 0, \
|
||||
"Set shader compiler shared library name or path") \
|
||||
debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \
|
||||
"Specify binary substitution config file for OpenCL") \
|
||||
debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \
|
||||
"Enable the error dialog on Windows") \
|
||||
release(size_t, GPU_PINNED_XFER_SIZE, 32, \
|
||||
"The pinned buffer size for pinning in read/write transfers in MiB") \
|
||||
release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \
|
||||
@@ -100,12 +82,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
|
||||
"The resource cache size in MB") \
|
||||
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
|
||||
"The maximum size accepted for suballocaitons in KB") \
|
||||
release(bool, GPU_FORCE_64BIT_PTR, 0, \
|
||||
"Forces 64 bit pointers on GPU") \
|
||||
release(bool, GPU_FORCE_OCL20_32BIT, 0, \
|
||||
"Forces 32 bit apps to take CLANG\HSAIL path") \
|
||||
release(bool, GPU_RAW_TIMESTAMP, 0, \
|
||||
"Reports GPU raw timestamps in GPU timeline") \
|
||||
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \
|
||||
"Number of memory objects for dependency tracking") \
|
||||
release(size_t, GPU_XFER_BUFFER_SIZE, 0, \
|
||||
@@ -116,32 +92,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \
|
||||
"Maximum size of a single allocation as percentage of total") \
|
||||
release(uint, GPU_NUM_COMPUTE_RINGS, 2, \
|
||||
"GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
|
||||
release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \
|
||||
"GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \
|
||||
release(uint, GPU_WORKLOAD_SPLIT, 22, \
|
||||
"Workload split size") \
|
||||
release(bool, GPU_USE_SINGLE_SCRATCH, false, \
|
||||
"Use single scratch buffer per device instead of per HW ring") \
|
||||
release(bool, AMD_OCL_WAIT_COMMAND, false, \
|
||||
"1 = Enable a wait for every submitted command") \
|
||||
release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
|
||||
"Prints the specified number of the child kernels") \
|
||||
release(bool, GPU_USE_DEVICE_QUEUE, false, \
|
||||
"Use a dedicated device queue for the actual submissions") \
|
||||
release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \
|
||||
"Enable >4GB single allocations") \
|
||||
release(bool, AMD_THREAD_TRACE_ENABLE, true, \
|
||||
"Enable thread trace extension") \
|
||||
release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \
|
||||
release(uint, OPENCL_VERSION, 200, \
|
||||
"Force GPU opencl verison") \
|
||||
release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \
|
||||
"Enable HSA device local memory usage") \
|
||||
release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \
|
||||
"Kernarg pool size") \
|
||||
release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \
|
||||
"Enable device memory for coarse grain SVM allocations") \
|
||||
release(bool, GPU_IFH_MODE, false, \
|
||||
"1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
|
||||
release(bool, GPU_MIPMAP, true, \
|
||||
"Enables GPU mipmap extension") \
|
||||
release(uint, GPU_ENABLE_PAL, 2, \
|
||||
@@ -152,8 +116,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \
|
||||
"Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
|
||||
release(uint, OCL_SET_SVM_SIZE, 4*16384, \
|
||||
"set SVM space size for discrete GPU") \
|
||||
debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \
|
||||
"Use flag to change the minimum requirement of system memory not to downgrade") \
|
||||
release(uint, GPU_WAVES_PER_SIMD, 0, \
|
||||
"Force the number of waves per SIMD (1-10)") \
|
||||
release(bool, GPU_WAVE_LIMIT_ENABLE, false, \
|
||||
@@ -176,10 +138,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \
|
||||
"File path prefix for dumping wave limiter output") \
|
||||
release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \
|
||||
"File path prefix for tracing wave limiter") \
|
||||
release(bool, OCL_CODE_CACHE_ENABLE, false, \
|
||||
"1 = Enable compiler code cache") \
|
||||
release(bool, OCL_CODE_CACHE_RESET, false, \
|
||||
"1 = Reset the compiler code cache storage") \
|
||||
release(bool, PAL_DISABLE_SDMA, false, \
|
||||
"1 = Disable SDMA for PAL") \
|
||||
release(uint, PAL_RGP_DISP_COUNT, 10000, \
|
||||
@@ -243,10 +201,6 @@ release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \
|
||||
"Enable CPU wait for dependent HSA signals.") \
|
||||
release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \
|
||||
"Enable system scope for signals (uses interrupts).") \
|
||||
release(bool, ROC_SKIP_COPY_SYNC, false, \
|
||||
"Skips copy syncs if runtime can predict the same engine.") \
|
||||
release(bool, ROC_ENABLE_PRE_VEGA, false, \
|
||||
"Enable support of pre-vega ASICs in ROCm path") \
|
||||
release(bool, GPU_FORCE_QUEUE_PROFILING, false, \
|
||||
"Force command queue profiling by default") \
|
||||
release(bool, HIP_MEM_POOL_SUPPORT, false, \
|
||||
|
||||
@@ -178,12 +178,6 @@
|
||||
#define ALWAYSINLINE
|
||||
#endif // !_MSC_VER
|
||||
|
||||
#ifdef BRAHMA
|
||||
#define IS_BRAHMA true
|
||||
#else
|
||||
#define IS_BRAHMA false
|
||||
#endif
|
||||
|
||||
//! \endcond
|
||||
|
||||
#endif // MACROS_HPP_
|
||||
|
||||
Referencia en una nueva incidencia
Block a user