From 3f4bbcfdba1aa947ca3cd2d22c85cb36613e2274 Mon Sep 17 00:00:00 2001 From: German Date: Wed, 23 Aug 2023 13:35:56 -0400 Subject: [PATCH] SWDEV-407533 - [ABI Break]Purge unused env vars Change-Id: I627950e8ebb6299affc602754a20d442dbe42b14 [ROCm/clr commit: 077311153a251ae0fc6b99a66944a66cc3abe8da] --- projects/clr/rocclr/device/device.cpp | 14 -- projects/clr/rocclr/device/device.hpp | 8 +- projects/clr/rocclr/device/devkernel.cpp | 148 ++++++++---------- projects/clr/rocclr/device/pal/paldevice.cpp | 4 +- .../clr/rocclr/device/pal/palsettings.cpp | 121 ++------------ .../clr/rocclr/device/pal/palsettings.hpp | 7 +- projects/clr/rocclr/device/pal/palvirtual.cpp | 85 +--------- projects/clr/rocclr/device/pal/palvirtual.hpp | 35 ----- projects/clr/rocclr/device/rocm/rocdevice.hpp | 2 +- .../clr/rocclr/device/rocm/rocsettings.cpp | 25 --- .../clr/rocclr/device/rocm/rocsettings.hpp | 13 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 2 +- projects/clr/rocclr/utils/flags.hpp | 48 +----- projects/clr/rocclr/utils/macros.hpp | 6 - 14 files changed, 88 insertions(+), 430 deletions(-) diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index bf88996f47..0249f31d6d 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -933,20 +933,6 @@ Settings::Settings() : value_(0) { customHostAllocator_ = false; waitCommand_ = AMD_OCL_WAIT_COMMAND; supportDepthsRGB_ = false; - commandQueues_ = 200; //!< Field value set to maximum number - //!< concurrent Virtual GPUs for default - - overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y)) - ? 2 - : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z)) - ? 4 - : 0; - fenceScopeAgent_ = AMD_OPT_FLUSH; if (amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 5074c6fb72..86e3caff1a 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -640,7 +640,6 @@ class Settings : public amd::HeapObject { uint64_t extensions_; //!< Supported OCL extensions union { struct { - uint overrideLclSet : 3; //!< Bit mask to override the local size uint apuSystem_ : 1; //!< Device is APU system with shared memory uint supportRA_ : 1; //!< Support RA channel order format uint waitCommand_ : 1; //!< Enables a wait for every submitted command @@ -660,14 +659,11 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 11; + uint reserved_ : 14; }; uint value_; }; - uint commandQueues_; //!< Field value for maximum number - //!< concurrent Virtual GPUs for each backend - //! Default constructor Settings(); @@ -1383,7 +1379,7 @@ class Isa { /// @returns If the ROCm runtime supports the ISA. bool runtimeRocSupported() const { - if (!IS_HIP && !ROC_ENABLE_PRE_VEGA && (versionMajor_ == 8)) { + if (!IS_HIP && (versionMajor_ == 8)) { return false; } return runtimeRocSupported_; diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp index 4bddff1982..db2bdfb81c 100644 --- a/projects/clr/rocclr/device/devkernel.cpp +++ b/projects/clr/rocclr/device/devkernel.cpp @@ -684,102 +684,82 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } - else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; } else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } + else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } - if (!workGroupInfo()->uniformWorkGroupSize_) { - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + if (!workGroupInfo()->uniformWorkGroupSize_) { + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } + } + // Use X dimension as high priority. Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Use X dimension as high priority. Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } + if (d != maxDim) { + lclWorkSize[d] = 1; } } } } } } - else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; - } - else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - else { - assert(0 && "Invalid workDim!"); - } - } } } else { diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 55585f3d34..d79daf71b8 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -300,7 +300,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); @@ -1029,7 +1029,7 @@ bool Device::create(Pal::IDevice* device) { nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp index 41d03af263..877498059a 100644 --- a/projects/clr/rocclr/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/device/pal/palsettings.cpp @@ -33,19 +33,6 @@ namespace pal { -/*! \brief information for adjusting maximum workload time - * - * This structure contains the time and OS minor version for max workload time - * adjustment for Windows 7 or 8. - */ -struct ModifyMaxWorkload { - uint32_t time; //!< max work load time (10x ms) - uint32_t minorVersion; //!< OS minor version -#if defined(_WIN32) - BYTE comparisonOps; //!< Comparison option -#endif -}; - Settings::Settings() { // Initialize the GPU device default settings oclVersion_ = OpenCL12; @@ -73,9 +60,6 @@ Settings::Settings() { libSelector_ = amd::LibraryUndefined; - // Enable workload split by default (for 24 bit arithmetic or timeout) - workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; - // By default use host blit blitEngine_ = BlitEngineHost; pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi; @@ -117,15 +101,10 @@ Settings::Settings() { // Number of compute rings. numComputeRings_ = 0; - minWorkloadTime_ = 1; // 0.1 ms - maxWorkloadTime_ = 500000; // 500 ms - // Controls tiled images in persistent //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS linearPersistentImage_ = false; - useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; - // Device enqueuing settings numDeviceEvents_ = 1024; numWaitEvents_ = 8; @@ -177,14 +156,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Disable thread trace by default for all devices threadTraceEnable_ = false; - bool doublePrecision = true; - - // Update GPU specific settings and info structure if we have any -#if defined(_WIN32) - ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL}; -#else - ModifyMaxWorkload modifyMaxWorkload = {0}; -#endif // APU systems if (palProp.gpuType == Pal::GpuType::Integrated) { @@ -250,14 +221,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // GFX10.1 HW doesn't support custom pitch. Enable double copy workaround imageBufferWar_ = GPU_IMAGE_BUFFER_WAR; } - if (false) { - // UnknownDevice0 HW doesn't have SDMA engine - disableSdma_ = true; - // And LDS is limited to 32KB - hwLDSSize_ = 32 * Ki; - // No fp64 support - doublePrecision = false; - } // Fall through to AI (gfx9) ... case Pal::AsicRevision::Vega20: // Enable HW P2P path for Vega20+. Runtime still relies on KMD/PAL for support @@ -277,15 +240,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Carrizo: case Pal::AsicRevision::Bristol: case Pal::AsicRevision::Stoney: - if (!aiPlus_) { - // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10 - minWorkloadTime_ = 1000; - modifyMaxWorkload.time = 1000; // Decided by experiment - modifyMaxWorkload.minorVersion = 1; // Win 7 -#if defined(_WIN32) - modifyMaxWorkload.comparisonOps = VER_EQUAL; // Limit to Win 7 only -#endif - } case Pal::AsicRevision::Iceland: case Pal::AsicRevision::Tonga: case Pal::AsicRevision::Fiji: @@ -307,15 +261,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Godavari: case Pal::AsicRevision::Spectre: case Pal::AsicRevision::Spooky: - if (!viPlus_) { - // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903) - modifyMaxWorkload.time = 250000; // 250ms - modifyMaxWorkload.minorVersion = 1; // Win 7 -#if defined(_WIN32) - modifyMaxWorkload.comparisonOps = VER_EQUAL; // limit to Win 7 -#endif - } - // Fall through ... case Pal::AsicRevision::Bonaire: case Pal::AsicRevision::Hawaii: case Pal::AsicRevision::HawaiiPro: @@ -331,13 +276,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp, libSelector_ = amd::GPU_Library_CI; if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ - ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) - : OpenCL12; - } - if (GPU_FORCE_OCL20_32BIT) { - force32BitOcl20_ = true; - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + oclVersion_ = !reportAsOCL12Device ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; } @@ -348,28 +287,14 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - - // This needs to be cleaned once 64bit addressing is stable - if (oclVersion_ < OpenCL20) { - use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) - ? LP64_SWITCH(false, - /*calAttr.isWorkstation ||*/ true) - : GPU_FORCE_64BIT_PTR; - } else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { - use64BitPtr_ = true; - } - } + + use64BitPtr_ = LP64_SWITCH(false, true); if (oclVersion_ >= OpenCL20) { supportDepthsRGB_ = true; } if (use64BitPtr_) { - if (GPU_ENABLE_LARGE_ALLOCATION) { - maxAllocSize_ = 64ULL * Gi; - } else { - maxAllocSize_ = 4048 * Mi; - } + maxAllocSize_ = 64ULL * Gi; } else { maxAllocSize_ = 3ULL * Gi; } @@ -395,26 +320,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Image DMA must be disabled if SDMA is disabled imageDMA_ &= !disableSdma_; - splitSizeForWin7_ = false; - -#if defined(_WIN32) - OSVERSIONINFOEX versionInfo = {0}; - versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); - versionInfo.dwMajorVersion = 6; - versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion; - - DWORDLONG conditionMask = 0; - VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps); - VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps); - - if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { - splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7 - if (modifyMaxWorkload.time > 0) { - maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time - } - } -#endif // defined(_WIN32) - // Enable atomics support enableExtension(ClKhrInt64BaseAtomics); enableExtension(ClKhrInt64ExtendedAtomics); @@ -457,23 +362,19 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // HW doesn't support untiled image writes // hostMemDirectAccess_ |= HostMemImage; - if (doublePrecision) { - // Report FP_FAST_FMA define if double precision HW - reportFMA_ = true; - // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper - // Bonaire, Kalindi, Spectre and Spooky so disable - // FP_FMA_FMAF for those parts in switch below - reportFMAF_ = true; - } + // Report FP_FAST_FMA define if double precision HW + reportFMA_ = true; + // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper + // Bonaire, Kalindi, Spectre and Spooky so disable + // FP_FMA_FMAF for those parts in switch below + reportFMAF_ = true; - // Make sure device actually supports double precision - doublePrecision_ = (doublePrecision) ? doublePrecision_ : false; if (doublePrecision_) { // Enable KHR double precision extension enableExtension(ClKhrFp64); } - if (!useLightning_ && doublePrecision) { + if (!useLightning_) { // Enable AMD double precision extension doublePrecision_ = true; enableExtension(ClAmdFp64); diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp index 66984622ee..32d3ad2a14 100644 --- a/projects/clr/rocclr/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/device/pal/palsettings.hpp @@ -70,19 +70,17 @@ class Settings : public device::Settings { uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features uint threadTraceEnable_ : 1; //!< Thread trace enable uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent - uint useSingleScratch_ : 1; //!< Allocates single scratch per device uint svmAtomics_ : 1; //!< SVM device atomics uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support uint useDeviceQueue_ : 1; //!< Submit to separate device queue uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround uint rgpSqttWaitIdle_ : 1; //!< Wait for idle after SQTT trace uint rgpSqttForceDisable_ : 1; //!< Disables SQTT - uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7 uint enableHwP2P_ : 1; //!< Forces HW P2P path for testing uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint reserved_ : 7; + uint reserved_ : 9; }; uint value_; }; @@ -92,9 +90,6 @@ class Settings : public device::Settings { uint hwLDSSize_; //!< HW local data store size uint maxWorkGroupSize_; //!< Requested workgroup size for this device uint preferredWorkGroupSize_; //!< Requested preferred workgroup size for this device - uint workloadSplitSize_; //!< Workload split size - uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms - uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms uint blitEngine_; //!< Blit engine type uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< L1 cache size in bytes diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 234ef67882..acd9fba3be 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -697,61 +697,6 @@ void VirtualGPU::MemoryDependency::clear(bool all) { } } -VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) { - aluCnt_ = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu * dev.info().simdWidth_ * - dev.info().maxComputeUnits_; - maxDispatchWorkload_ = static_cast(dev.info().maxEngineClockFrequency_) * - // find time in us - dev.settings().maxWorkloadTime_ * aluCnt_; - resetCbWorkload(dev); -} - -void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) { - cbWorkload_ = 0; - maxCbWorkload_ = static_cast(dev.info().maxEngineClockFrequency_) * - // find time in us - dev.settings().minWorkloadTime_ * aluCnt_; -} - -void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads, - uint instructions) { - if (!dev.settings().splitSizeForWin7_) { - dispatchSplitSize_ = 0; - return; - } - - uint64_t workload = threads * instructions; - if (maxDispatchWorkload_ < workload) { - dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); - uint fullLoad = dev.info().maxComputeUnits_ * dev.info().preferredWorkGroupSize_; - if ((dispatchSplitSize_ % fullLoad) != 0) { - dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; - } - } else { - dispatchSplitSize_ = - (threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0; - } -} - -bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) { - bool cbReady = false; - uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions; - // Add current workload to the overall workload in the current DMA - cbWorkload_ += workload; - // Did it exceed maximum? - if (cbWorkload_ > maxCbWorkload_) { - // Reset DMA workload - cbWorkload_ = 0; - // Increase workload of the next DMA buffer by 50% - maxCbWorkload_ = maxCbWorkload_ * 3 / 2; - if (maxCbWorkload_ > maxDispatchWorkload_) { - maxCbWorkload_ = maxDispatchWorkload_; - } - cbReady = true; - } - return cbReady; -} - void VirtualGPU::addPinnedMem(amd::Memory* mem) { if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { if (pinnedMems_.size() > 7) { @@ -897,7 +842,6 @@ VirtualGPU::VirtualGPU(Device& device) gpuDevice_(static_cast(device)), printfDbgHSA_(nullptr), tsCache_(nullptr), - dmaFlushMgmt_(device), managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki), writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_), hwRing_(0), @@ -932,11 +876,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // because destructor calls eraseResourceList() even if create() failed dev().resizeResoureList(index()); - if (index() >= GPU_MAX_COMMAND_QUEUES) { - // Cap the maximum number of concurrent Virtual GPUs - return false; - } - // Virtual GPU will have profiling enabled state_.profiling_ = profiling; @@ -2632,16 +2571,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const // Add ISA memory object to the resource tracking list AddKernel(kernel); - bool needFlush = false; - // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd - // are in the same cmdBuffer - if (!state_.perfCounterEnabled_) { - dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); - if (dmaFlushMgmt().dispatchSplitSize() != 0) { - needFlush = true; - } - } - // Check if it is blit kernel. If it is, then check if split is needed. if (hsaKernel.isInternalKernel()) { // Calculate new group size for each submission @@ -2737,7 +2666,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } // Update the global GPU event - setGpuEvent(gpuEvent, needFlush); + constexpr bool kNeedFLush = false; + setGpuEvent(gpuEvent, kNeedFLush); if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { LogError("Couldn't read printf data from the buffer!\n"); @@ -2799,10 +2729,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (!foundEvent) { state_.forceWait_ = true; } - // If we don't have any more batches, then assume GPU is idle - else if (cbQueue_.empty()) { - dmaFlushMgmt_.resetCbWorkload(dev()); - } } } @@ -3325,11 +3251,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Make sure the command batch has a valid GPU TS - if (!GPU_RAW_TIMESTAMP) { - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; - } + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; } } } diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 25c5f96d66..2595342c74 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -290,36 +290,6 @@ class VirtualGPU : public device::VirtualDevice { size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue }; - class DmaFlushMgmt : public amd::EmbeddedObject { - public: - DmaFlushMgmt(const Device& dev); - - // Resets DMA command buffer workload - void resetCbWorkload(const Device& dev); - - // Finds split size for the current dispatch - void findSplitSize(const Device& dev, //!< GPU device object - uint64_t threads, //!< Total number of execution threads - uint instructions //!< Number of ALU instructions - ); - - // Returns TRUE if DMA command buffer is ready for a flush - bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object - uint64_t threads, //!< Total number of execution threads - uint instructions //!< Number of ALU instructions - ); - - // Returns dispatch split size - uint dispatchSplitSize() const { return dispatchSplitSize_; } - - private: - uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch - uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer - uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer - uint aluCnt_; //!< All ALUs on the chip - uint dispatchSplitSize_; //!< Dispath split size in elements - }; - public: VirtualGPU(Device& device); //! Creates virtual gpu object @@ -470,9 +440,6 @@ class VirtualGPU : public device::VirtualDevice { //! Returns hsaQueueMem_ const Memory* hsaQueueMem() const { return hsaQueueMem_; } - //! Returns DMA flush management structure - const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; } - //! Returns the HW ring used on this virtual device uint hwRing() const { return hwRing_; } @@ -695,8 +662,6 @@ class VirtualGPU : public device::VirtualDevice { TimeStampCache* tsCache_; //!< TimeStamp cache MemoryDependency memoryDependency_; //!< Memory dependency class - DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management - std::vector pinnedMems_; //!< Pinned memory list ManagedBuffer managedBuffer_; //!< Managed write buffer diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 763d941b4b..9cfe9d3b09 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -233,7 +233,7 @@ class NullDevice : public amd::Device { //! Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { - return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1); + return (memory->getContext().devices().size() > 1); } virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle, diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index 71c341ad43..e747c9c2f6 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -36,17 +36,10 @@ Settings::Settings() { doublePrecision_ = ::CL_KHR_FP64; enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; - enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; - maxWorkGroupSize2DX_ = 16; - maxWorkGroupSize2DY_ = 16; - maxWorkGroupSize3DX_ = 4; - maxWorkGroupSize3DY_ = 4; - maxWorkGroupSize3DZ_ = 4; - kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; // Determine if user is requesting Non-Coherent mode @@ -96,7 +89,6 @@ Settings::Settings() { cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ? ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_; system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL; - skip_copy_sync_ = ROC_SKIP_COPY_SYNC; // Use coarse grain system memory for kernel arguments by default (to keep GPU cache) fgs_kernel_arg_ = false; @@ -201,23 +193,6 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } - if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { - maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { - maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - - if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { - maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { - maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { - maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index d2fffd73db..4f745f8521 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -43,7 +43,6 @@ class Settings : public device::Settings { struct { uint doublePrecision_ : 1; //!< Enables double precision support uint enableLocalMemory_ : 1; //!< Enable GPUVM memory - uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read @@ -51,11 +50,10 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint cpu_wait_for_signal_ : 1; //!< Wait for HSA signal on CPU uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system - uint skip_copy_sync_ : 1; //!< Ignore explicit HSA signal waits for copy functionality uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 18; + uint reserved_ : 20; }; uint value_; }; @@ -66,15 +64,6 @@ class Settings : public device::Settings { //! Preferred workgroup size uint preferredWorkGroupSize_; - //! Default max workgroup sizes for 2D - int maxWorkGroupSize2DX_; - int maxWorkGroupSize2DY_; - - //! Default max workgroup sizes for 3D - int maxWorkGroupSize3DX_; - int maxWorkGroupSize3DY_; - int maxWorkGroupSize3DZ_; - uint kernargPoolSize_; uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 129e7466b2..6450dd86ae 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -490,7 +490,7 @@ std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi // Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr // and ignore the signal wait, relying on in-order engine execution const Settings& settings = gpu_.dev().settings(); - if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) { + if (engine != HwQueueEngine::Compute) { explicit_wait = true; } } diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 3baae3e590..809278d1b4 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ -release(uint, GPU_MAX_COMMAND_QUEUES, 300, \ - "The maximum number of concurrent Virtual GPUs") \ release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \ "The default command queue thread stack size") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \ debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ "Use guard pages for CPU memory") \ debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ @@ -74,8 +62,6 @@ release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ release(bool, GPU_FLUSH_ON_EXECUTION, false, \ "Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \ -release(bool, GPU_USE_SYNC_OBJECTS, true, \ - "If enabled, use sync objects instead of polling") \ release(bool, CL_KHR_FP64, true, \ "Enable/Disable support for double precision") \ release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ @@ -86,12 +72,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ "Set clLinkProgram()'s options (override)") \ release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ "Append clLinkProgram()'s options") \ -release(cstring, AMD_OCL_SC_LIB, 0, \ - "Set shader compiler shared library name or path") \ debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ "Specify binary substitution config file for OpenCL") \ -debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \ - "Enable the error dialog on Windows") \ release(size_t, GPU_PINNED_XFER_SIZE, 32, \ "The pinned buffer size for pinning in read/write transfers in MiB") \ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ @@ -100,12 +82,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ "The resource cache size in MB") \ release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ "The maximum size accepted for suballocaitons in KB") \ -release(bool, GPU_FORCE_64BIT_PTR, 0, \ - "Forces 64 bit pointers on GPU") \ -release(bool, GPU_FORCE_OCL20_32BIT, 0, \ - "Forces 32 bit apps to take CLANG\HSAIL path") \ -release(bool, GPU_RAW_TIMESTAMP, 0, \ - "Reports GPU raw timestamps in GPU timeline") \ release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ "Number of memory objects for dependency tracking") \ release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ @@ -116,32 +92,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \ "Maximum size of a single allocation as percentage of total") \ release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ -release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \ - "GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \ -release(uint, GPU_WORKLOAD_SPLIT, 22, \ - "Workload split size") \ -release(bool, GPU_USE_SINGLE_SCRATCH, false, \ - "Use single scratch buffer per device instead of per HW ring") \ release(bool, AMD_OCL_WAIT_COMMAND, false, \ "1 = Enable a wait for every submitted command") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ -release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ - "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, true, \ "Enable thread trace extension") \ -release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ +release(uint, OPENCL_VERSION, 200, \ "Force GPU opencl verison") \ release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ "Enable HSA device local memory usage") \ release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ "Kernarg pool size") \ -release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ - "Enable device memory for coarse grain SVM allocations") \ -release(bool, GPU_IFH_MODE, false, \ - "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \ release(uint, GPU_ENABLE_PAL, 2, \ @@ -152,8 +116,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ -debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \ - "Use flag to change the minimum requirement of system memory not to downgrade") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ @@ -176,10 +138,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ "File path prefix for dumping wave limiter output") \ release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ "File path prefix for tracing wave limiter") \ -release(bool, OCL_CODE_CACHE_ENABLE, false, \ - "1 = Enable compiler code cache") \ -release(bool, OCL_CODE_CACHE_RESET, false, \ - "1 = Reset the compiler code cache storage") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \ @@ -243,10 +201,6 @@ release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \ "Enable CPU wait for dependent HSA signals.") \ release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \ "Enable system scope for signals (uses interrupts).") \ -release(bool, ROC_SKIP_COPY_SYNC, false, \ - "Skips copy syncs if runtime can predict the same engine.") \ -release(bool, ROC_ENABLE_PRE_VEGA, false, \ - "Enable support of pre-vega ASICs in ROCm path") \ release(bool, GPU_FORCE_QUEUE_PROFILING, false, \ "Force command queue profiling by default") \ release(bool, HIP_MEM_POOL_SUPPORT, false, \ diff --git a/projects/clr/rocclr/utils/macros.hpp b/projects/clr/rocclr/utils/macros.hpp index 02fef7599b..aaf7e7bae0 100644 --- a/projects/clr/rocclr/utils/macros.hpp +++ b/projects/clr/rocclr/utils/macros.hpp @@ -178,12 +178,6 @@ #define ALWAYSINLINE #endif // !_MSC_VER -#ifdef BRAHMA -#define IS_BRAHMA true -#else -#define IS_BRAHMA false -#endif - //! \endcond #endif // MACROS_HPP_