From dfa77900301856bcc5c987fcd26aa70d5f1ca12f Mon Sep 17 00:00:00 2001 From: German Date: Fri, 3 Feb 2023 13:44:24 -0500 Subject: [PATCH] SWDEV-368235 - Remove obsolete env variables Change-Id: I7e14d53297e79e2f68b3a6cc40251ad7db9eb5ab [ROCm/clr commit: 7b50c935f862ae2d24d354fad56c9ae9098bf8d7] --- projects/clr/rocclr/device/device.cpp | 11 -- projects/clr/rocclr/device/device.hpp | 3 +- projects/clr/rocclr/device/devkernel.cpp | 150 ++++++++---------- projects/clr/rocclr/device/pal/paldevice.cpp | 4 +- .../clr/rocclr/device/pal/palsettings.cpp | 39 +---- .../clr/rocclr/device/pal/palsettings.hpp | 4 +- projects/clr/rocclr/device/pal/palvirtual.cpp | 25 +-- projects/clr/rocclr/device/rocm/rocdevice.cpp | 58 ++++--- projects/clr/rocclr/device/rocm/rocdevice.hpp | 2 +- .../clr/rocclr/device/rocm/rocsettings.cpp | 26 --- .../clr/rocclr/device/rocm/rocsettings.hpp | 13 +- projects/clr/rocclr/utils/flags.hpp | 44 ----- 12 files changed, 106 insertions(+), 273 deletions(-) diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index f87452dbd5..288f84f616 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -798,17 +798,6 @@ Settings::Settings() : value_(0) { commandQueues_ = 200; //!< Field value set to maximum number //!< concurrent Virtual GPUs for default - overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y)) - ? 2 - : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z)) - ? 
4 - : 0; - fenceScopeAgent_ = AMD_OPT_FLUSH; if (amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index a493795e18..ed206311fb 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -619,7 +619,6 @@ class Settings : public amd::HeapObject { uint64_t extensions_; //!< Supported OCL extensions union { struct { - uint overrideLclSet : 3; //!< Bit mask to override the local size uint apuSystem_ : 1; //!< Device is APU system with shared memory uint supportRA_ : 1; //!< Support RA channel order format uint waitCommand_ : 1; //!< Enables a wait for every submitted command @@ -639,7 +638,7 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 11; + uint reserved_ : 14; }; uint value_; }; diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp index e598dca74f..42eb185741 100644 --- a/projects/clr/rocclr/device/devkernel.cpp +++ b/projects/clr/rocclr/device/devkernel.cpp @@ -677,98 +677,78 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && 
(((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } - else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; } else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } - - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Use X dimension as high priority. 
Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - } + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; } } else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; } - else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - else { - assert(0 && "Invalid workDim!"); + + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } + } + // Use X dimension as high priority. 
Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } } } } diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 75cd3c3eaa..6c60ef0524 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); @@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) { nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp index 1f4dc8e908..59ac82b8ea 100644 --- a/projects/clr/rocclr/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/device/pal/palsettings.cpp @@ -76,8 +76,6 @@ Settings::Settings() { // Enable workload split by default (for 24 bit arithmetic or timeout) workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; - // By default use host blit - blitEngine_ = BlitEngineHost; pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) ? 
128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; @@ -123,8 +121,6 @@ Settings::Settings() { //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS linearPersistentImage_ = false; - useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; - // Device enqueuing settings numDeviceEvents_ = 1024; numWaitEvents_ = 8; @@ -328,16 +324,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp, libSelector_ = amd::GPU_Library_CI; if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ - ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) - : OpenCL12; - } - if (GPU_FORCE_OCL20_32BIT) { - force32BitOcl20_ = true; - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + oclVersion_ = !reportAsOCL12Device ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; } + if (OPENCL_VERSION < 200) { oclVersion_ = OpenCL12; } @@ -346,27 +337,13 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - // This needs to be cleaned once 64bit addressing is stable - if (oclVersion_ < OpenCL20) { - use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) - ? 
LP64_SWITCH(false, - /*calAttr.isWorkstation ||*/ true) - : GPU_FORCE_64BIT_PTR; - } else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { - use64BitPtr_ = true; - } - } + use64BitPtr_ = LP64_SWITCH(false, true); if (oclVersion_ >= OpenCL20) { supportDepthsRGB_ = true; } if (use64BitPtr_) { - if (GPU_ENABLE_LARGE_ALLOCATION) { - maxAllocSize_ = 64ULL * Gi; - } else { - maxAllocSize_ = 4048 * Mi; - } + maxAllocSize_ = 64ULL * Gi; } else { maxAllocSize_ = 3ULL * Gi; } @@ -447,9 +424,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, imageSupport_ = true; - // Use kernels for blit if appropriate - blitEngine_ = BlitEngineKernel; - hostMemDirectAccess_ |= HostMemBuffer; // HW doesn't support untiled image writes // hostMemDirectAccess_ |= HostMemImage; @@ -542,11 +516,6 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } - // Override blit engine type - if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { - blitEngine_ = GPU_BLIT_ENGINE_TYPE; - } - if (!flagIsDefault(DEBUG_GPU_FLAGS)) { debugFlags_ = DEBUG_GPU_FLAGS; } diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp index 66984622ee..4bcc7e2e9b 100644 --- a/projects/clr/rocclr/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/device/pal/palsettings.hpp @@ -70,7 +70,6 @@ class Settings : public device::Settings { uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features uint threadTraceEnable_ : 1; //!< Thread trace enable uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent - uint useSingleScratch_ : 1; //!< Allocates single scratch per device uint svmAtomics_ : 1; //!< SVM device atomics uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support uint useDeviceQueue_ : 1; //!< Submit to separate device queue @@ -82,7 +81,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support 
uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint reserved_ : 7; + uint reserved_ : 8; }; uint value_; }; @@ -95,7 +94,6 @@ class Settings : public device::Settings { uint workloadSplitSize_; //!< Workload split size uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms - uint blitEngine_; //!< Blit engine type uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< L1 cache size in bytes uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 624379d104..3a2081712a 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -905,11 +905,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // because destructor calls eraseResourceList() even if create() failed dev().resizeResoureList(index()); - if (index() >= GPU_MAX_COMMAND_QUEUES) { - // Cap the maximum number of concurrent Virtual GPUs - return false; - } - // Virtual GPU will have profiling enabled state_.profiling_ = profiling; @@ -1020,18 +1015,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } - // Choose the appropriate class for blit engine - switch (dev().settings().blitEngine_) { - default: - // Fall through ... - case Settings::BlitEngineHost: - blitSetup.disableAll(); - // Fall through ... 
- case Settings::BlitEngineCAL: - case Settings::BlitEngineKernel: - blitMgr_ = new KernelBlitManager(*this, blitSetup); - break; - } + blitMgr_ = new KernelBlitManager(*this, blitSetup); if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { LogError("Could not create BlitManager!"); return false; @@ -3269,11 +3253,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Make sure the command batch has a valid GPU TS - if (!GPU_RAW_TIMESTAMP) { - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; - } + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; } } } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 01c038c9d6..e63ab77eb0 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -894,37 +894,35 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo Device* dev = reinterpret_cast<Device*>(data); switch (segment_type) { case HSA_REGION_SEGMENT_GLOBAL: { - if (dev->settings().enableLocalMemory_) { - uint32_t global_flag = 0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; + uint32_t global_flag = 0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { + dev->gpu_fine_grained_segment_ = pool; + } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { + dev->gpuvm_segment_ = pool; + + // If cpu agent cannot access this pool, the device does not support large bar. 
+ hsa_amd_memory_pool_access_t tmp{}; + hsa_amd_agent_memory_pool_get_info( + dev->cpu_agent_, + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &tmp); + + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + dev->info_.largeBar_ = false; + } else { + dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; } + } - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->gpu_fine_grained_segment_ = pool; - } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { - dev->gpuvm_segment_ = pool; - - // If cpu agent cannot access this pool, the device does not support large bar. - hsa_amd_memory_pool_access_t tmp{}; - hsa_amd_agent_memory_pool_get_info( - dev->cpu_agent_, - pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &tmp); - - if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - dev->info_.largeBar_ = false; - } else { - dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; - } - } - - if (dev->gpuvm_segment_.handle == 0) { - dev->gpuvm_segment_ = pool; - } + if (dev->gpuvm_segment_.handle == 0) { + dev->gpuvm_segment_ = pool; } break; } @@ -1232,7 +1230,7 @@ bool Device::populateOCLDeviceConstants() { info_.maxWorkItemDimensions_ = 3; - if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { + if (gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 3e254ff987..99d0976e13 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -218,7 +218,7 @@ class NullDevice : public amd::Device { //! 
Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { - return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1); + return (memory->getContext().devices().size() > 1); } virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) { diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index 71c341ad43..24cb88dcbd 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -35,18 +35,9 @@ Settings::Settings() { // Set this to true when we drop the flag doublePrecision_ = ::CL_KHR_FP64; - enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; - enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; - maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; - maxWorkGroupSize2DX_ = 16; - maxWorkGroupSize2DY_ = 16; - maxWorkGroupSize3DX_ = 4; - maxWorkGroupSize3DY_ = 4; - maxWorkGroupSize3DZ_ = 4; - kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; // Determine if user is requesting Non-Coherent mode @@ -201,23 +192,6 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } - if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { - maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { - maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - - if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { - maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { - maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { - maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index d2fffd73db..5b5f81d7d7 100644 --- 
a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -42,8 +42,6 @@ class Settings : public device::Settings { union { struct { uint doublePrecision_ : 1; //!< Enables double precision support - uint enableLocalMemory_ : 1; //!< Enable GPUVM memory - uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read @@ -55,7 +53,7 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 18; + uint reserved_ : 20; }; uint value_; }; @@ -66,15 +64,6 @@ class Settings : public device::Settings { //! Preferred workgroup size uint preferredWorkGroupSize_; - //! Default max workgroup sizes for 2D - int maxWorkGroupSize2DX_; - int maxWorkGroupSize2DY_; - - //! Default max workgroup sizes for 3D - int maxWorkGroupSize3DX_; - int maxWorkGroupSize3DY_; - int maxWorkGroupSize3DZ_; - uint kernargPoolSize_; uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 186d059b41..f563c6d3bc 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ -release(uint, GPU_MAX_COMMAND_QUEUES, 300, \ - "The maximum number of concurrent Virtual GPUs") \ release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! 
*/ \ "The default command queue thread stack size") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \ debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ "Use guard pages for CPU memory") \ debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ @@ -70,12 +58,8 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4, \ "Size of the GPU staging buffer in MiB") \ release(bool, GPU_DUMP_BLIT_KERNELS, false, \ "Dump the kernels for blit manager") \ -release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ - "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ release(bool, GPU_FLUSH_ON_EXECUTION, false, \ "Submit commands to HW on every operation. 
0 - Disable, 1 - Enable") \ -release(bool, GPU_USE_SYNC_OBJECTS, true, \ - "If enabled, use sync objects instead of polling") \ release(bool, CL_KHR_FP64, true, \ "Enable/Disable support for double precision") \ release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ @@ -86,12 +70,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ "Set clLinkProgram()'s options (override)") \ release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ "Append clLinkProgram()'s options") \ -release(cstring, AMD_OCL_SC_LIB, 0, \ - "Set shader compiler shared library name or path") \ debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ "Specify binary substitution config file for OpenCL") \ -debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \ - "Enable the error dialog on Windows") \ release(size_t, GPU_PINNED_XFER_SIZE, 32, \ "The pinned buffer size for pinning in read/write transfers in MiB") \ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ @@ -100,12 +80,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ "The resource cache size in MB") \ release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ "The maximum size accepted for suballocaitons in KB") \ -release(bool, GPU_FORCE_64BIT_PTR, 0, \ - "Forces 64 bit pointers on GPU") \ -release(bool, GPU_FORCE_OCL20_32BIT, 0, \ - "Forces 32 bit apps to take CLANG\HSAIL path") \ -release(bool, GPU_RAW_TIMESTAMP, 0, \ - "Reports GPU raw timestamps in GPU timeline") \ release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ "Number of memory objects for dependency tracking") \ release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ @@ -116,32 +90,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \ "Maximum size of a single allocation as percentage of total") \ release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ -release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \ - "GPU select the compute rings ID -1 - disabled, 0 , 1,.. 
- the forced compute rings ID for submission") \ release(uint, GPU_WORKLOAD_SPLIT, 22, \ "Workload split size") \ -release(bool, GPU_USE_SINGLE_SCRATCH, false, \ - "Use single scratch buffer per device instead of per HW ring") \ release(bool, AMD_OCL_WAIT_COMMAND, false, \ "1 = Enable a wait for every submitted command") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ -release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ - "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, true, \ "Enable thread trace extension") \ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ "Force GPU opencl verison") \ -release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ - "Enable HSA device local memory usage") \ release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ "Kernarg pool size") \ -release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ - "Enable device memory for coarse grain SVM allocations") \ -release(bool, GPU_IFH_MODE, false, \ - "1 = Enable GPU IFH (infinitely fast hardware) mode. 
Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \ release(uint, GPU_ENABLE_PAL, 2, \ @@ -152,8 +114,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ -debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \ - "Use flag to change the minimum requirement of system memory not to downgrade") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ @@ -176,10 +136,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ "File path prefix for dumping wave limiter output") \ release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ "File path prefix for tracing wave limiter") \ -release(bool, OCL_CODE_CACHE_ENABLE, false, \ - "1 = Enable compiler code cache") \ -release(bool, OCL_CODE_CACHE_RESET, false, \ - "1 = Reset the compiler code cache storage") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \