From d7fdd9fcb8f96ebefa7961da492ae64e59867648 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 15 Feb 2023 07:23:23 +0000 Subject: [PATCH] SWDEV-368235 - Revert "Remove obsolete env variables" This reverts commit dfa77900301856bcc5c987fcd26aa70d5f1ca12f. Reason for revert: Deferred to a future release. Change-Id: Ia66c37f0ab9734dee73c930d10d7469d5fd57254 [ROCm/clr commit: 5dc104b3ea98b6cd9c6b4a227de696a120b7ceee] --- projects/clr/rocclr/device/device.cpp | 11 ++ projects/clr/rocclr/device/device.hpp | 3 +- projects/clr/rocclr/device/devkernel.cpp | 150 ++++++++++-------- projects/clr/rocclr/device/pal/paldevice.cpp | 4 +- .../clr/rocclr/device/pal/palsettings.cpp | 39 ++++- .../clr/rocclr/device/pal/palsettings.hpp | 4 +- projects/clr/rocclr/device/pal/palvirtual.cpp | 25 ++- projects/clr/rocclr/device/rocm/rocdevice.cpp | 58 +++---- projects/clr/rocclr/device/rocm/rocdevice.hpp | 2 +- .../clr/rocclr/device/rocm/rocsettings.cpp | 26 +++ .../clr/rocclr/device/rocm/rocsettings.hpp | 13 +- projects/clr/rocclr/utils/flags.hpp | 44 +++++ 12 files changed, 273 insertions(+), 106 deletions(-) diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index 288f84f616..f87452dbd5 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -798,6 +798,17 @@ Settings::Settings() : value_(0) { commandQueues_ = 200; //!< Field value set to maximum number //!< concurrent Virtual GPUs for default + overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0; + overrideLclSet |= + (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y)) + ? 2 + : 0; + overrideLclSet |= + (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z)) + ? 4 + : 0; + fenceScopeAgent_ = AMD_OPT_FLUSH; if (amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index ed206311fb..a493795e18 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -619,6 +619,7 @@ class Settings : public amd::HeapObject { uint64_t extensions_; //!< Supported OCL extensions union { struct { + uint overrideLclSet : 3; //!< Bit mask to override the local size uint apuSystem_ : 1; //!< Device is APU system with shared memory uint supportRA_ : 1; //!< Support RA channel order format uint waitCommand_ : 1; //!< Enables a wait for every submitted command @@ -638,7 +639,7 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 14; + uint reserved_ : 11; }; uint value_; }; diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp index 42eb185741..e598dca74f 100644 --- a/projects/clr/rocclr/device/devkernel.cpp +++ b/projects/clr/rocclr/device/devkernel.cpp @@ -677,78 +677,98 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; + if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; + } + else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } } else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } + + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } + } + // Use X dimension as high priority. Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } + } } } else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; } - - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Use X dimension as high priority. Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } + else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + else if (workDim == 3) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + else { + assert(0 && "Invalid workDim!"); } } } diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 5755f34a81..8b07d3914c 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve nullptr, nullptr, nullptr, - nullptr}; + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); @@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) { nullptr, nullptr, nullptr, - nullptr}; + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp index 59ac82b8ea..1f4dc8e908 100644 --- a/projects/clr/rocclr/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/device/pal/palsettings.cpp @@ -76,6 +76,8 @@ Settings::Settings() { // Enable workload split by default (for 24 bit arithmetic or timeout) workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; + // By default use host blit + blitEngine_ = BlitEngineHost; pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; @@ -121,6 +123,8 @@ Settings::Settings() { //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS linearPersistentImage_ = false; + useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; + // Device enqueuing settings numDeviceEvents_ = 1024; numWaitEvents_ = 8; @@ -324,11 +328,16 @@ bool Settings::create(const Pal::DeviceProperties& palProp, libSelector_ = amd::GPU_Library_CI; if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) + : OpenCL12; + } + if (GPU_FORCE_OCL20_32BIT) { + force32BitOcl20_ = true; + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; } - if (OPENCL_VERSION < 200) { oclVersion_ = OpenCL12; } @@ -337,13 +346,27 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - use64BitPtr_ = LP64_SWITCH(false, true); + // This needs to be cleaned once 64bit addressing is stable + if (oclVersion_ < OpenCL20) { + use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) + ? LP64_SWITCH(false, + /*calAttr.isWorkstation ||*/ true) + : GPU_FORCE_64BIT_PTR; + } else { + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { + use64BitPtr_ = true; + } + } if (oclVersion_ >= OpenCL20) { supportDepthsRGB_ = true; } if (use64BitPtr_) { - maxAllocSize_ = 64ULL * Gi; + if (GPU_ENABLE_LARGE_ALLOCATION) { + maxAllocSize_ = 64ULL * Gi; + } else { + maxAllocSize_ = 4048 * Mi; + } } else { maxAllocSize_ = 3ULL * Gi; } @@ -424,6 +447,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp, imageSupport_ = true; + // Use kernels for blit if appropriate + blitEngine_ = BlitEngineKernel; + hostMemDirectAccess_ |= HostMemBuffer; // HW doesn't support untiled image writes // hostMemDirectAccess_ |= HostMemImage; @@ -516,6 +542,11 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } + // Override blit engine type + if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { + blitEngine_ = GPU_BLIT_ENGINE_TYPE; + } + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { debugFlags_ = DEBUG_GPU_FLAGS; } diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp index 4bcc7e2e9b..66984622ee 100644 --- a/projects/clr/rocclr/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/device/pal/palsettings.hpp @@ -70,6 +70,7 @@ class Settings : public device::Settings { uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features uint threadTraceEnable_ : 1; //!< Thread trace enable uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent + uint useSingleScratch_ : 1; //!< Allocates single scratch per device uint svmAtomics_ : 1; //!< SVM device atomics uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support uint useDeviceQueue_ : 1; //!< Submit to separate device queue @@ -81,7 +82,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint reserved_ : 8; + uint reserved_ : 7; }; uint value_; }; @@ -94,6 +95,7 @@ class Settings : public device::Settings { uint workloadSplitSize_; //!< Workload split size uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms + uint blitEngine_; //!< Blit engine type uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< L1 cache size in bytes uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 3a2081712a..624379d104 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -905,6 +905,11 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // because destructor calls eraseResourceList() even if create() failed dev().resizeResoureList(index()); + if (index() >= GPU_MAX_COMMAND_QUEUES) { + // Cap the maximum number of concurrent Virtual GPUs + return false; + } + // Virtual GPU will have profiling enabled state_.profiling_ = profiling; @@ -1015,7 +1020,18 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } - blitMgr_ = new KernelBlitManager(*this, blitSetup); + // Choose the appropriate class for blit engine + switch (dev().settings().blitEngine_) { + default: + // Fall through ... + case Settings::BlitEngineHost: + blitSetup.disableAll(); + // Fall through ... + case Settings::BlitEngineCAL: + case Settings::BlitEngineKernel: + blitMgr_ = new KernelBlitManager(*this, blitSetup); + break; + } if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { LogError("Could not create BlitManager!"); return false; @@ -3253,8 +3269,11 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + // Make sure the command batch has a valid GPU TS + if (!GPU_RAW_TIMESTAMP) { + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + } } } } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 95a4d375f0..2151467103 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -894,35 +894,37 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo Device* dev = reinterpret_cast(data); switch (segment_type) { case HSA_REGION_SEGMENT_GLOBAL: { - uint32_t global_flag = 0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->gpu_fine_grained_segment_ = pool; - } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { - dev->gpuvm_segment_ = pool; - - // If cpu agent cannot access this pool, the device does not support large bar. - hsa_amd_memory_pool_access_t tmp{}; - hsa_amd_agent_memory_pool_get_info( - dev->cpu_agent_, - pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &tmp); - - if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - dev->info_.largeBar_ = false; - } else { - dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; + if (dev->settings().enableLocalMemory_) { + uint32_t global_flag = 0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + if (stat != HSA_STATUS_SUCCESS) { + return stat; } - } - if (dev->gpuvm_segment_.handle == 0) { - dev->gpuvm_segment_ = pool; + if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { + dev->gpu_fine_grained_segment_ = pool; + } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { + dev->gpuvm_segment_ = pool; + + // If cpu agent cannot access this pool, the device does not support large bar. + hsa_amd_memory_pool_access_t tmp{}; + hsa_amd_agent_memory_pool_get_info( + dev->cpu_agent_, + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &tmp); + + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + dev->info_.largeBar_ = false; + } else { + dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; + } + } + + if (dev->gpuvm_segment_.handle == 0) { + dev->gpuvm_segment_ = pool; + } } break; } @@ -1232,7 +1234,7 @@ bool Device::populateOCLDeviceConstants() { info_.maxWorkItemDimensions_ = 3; - if (gpuvm_segment_.handle != 0) { + if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 275fe45761..3fcbf0391e 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -218,7 +218,7 @@ class NullDevice : public amd::Device { //! Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { - return (memory->getContext().devices().size() > 1); + return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1); } virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) { diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index 24cb88dcbd..71c341ad43 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -35,9 +35,18 @@ Settings::Settings() { // Set this to true when we drop the flag doublePrecision_ = ::CL_KHR_FP64; + enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; + enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; + maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; + maxWorkGroupSize2DX_ = 16; + maxWorkGroupSize2DY_ = 16; + maxWorkGroupSize3DX_ = 4; + maxWorkGroupSize3DY_ = 4; + maxWorkGroupSize3DZ_ = 4; + kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; // Determine if user is requesting Non-Coherent mode @@ -192,6 +201,23 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } + if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { + maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { + maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + + if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { + maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { + maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { + maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index 5b5f81d7d7..d2fffd73db 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -42,6 +42,8 @@ class Settings : public device::Settings { union { struct { uint doublePrecision_ : 1; //!< Enables double precision support + uint enableLocalMemory_ : 1; //!< Enable GPUVM memory + uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read @@ -53,7 +55,7 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 20; + uint reserved_ : 18; }; uint value_; }; @@ -64,6 +66,15 @@ class Settings : public device::Settings { //! Preferred workgroup size uint preferredWorkGroupSize_; + //! Default max workgroup sizes for 2D + int maxWorkGroupSize2DX_; + int maxWorkGroupSize2DY_; + + //! Default max workgroup sizes for 3D + int maxWorkGroupSize3DX_; + int maxWorkGroupSize3DY_; + int maxWorkGroupSize3DZ_; + uint kernargPoolSize_; uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index f563c6d3bc..186d059b41 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -30,10 +30,22 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ +release(uint, GPU_MAX_COMMAND_QUEUES, 300, \ + "The maximum number of concurrent Virtual GPUs") \ release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \ "The default command queue thread stack size") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \ + "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \ + "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \ debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ "Use guard pages for CPU memory") \ debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ @@ -58,8 +70,12 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4, \ "Size of the GPU staging buffer in MiB") \ release(bool, GPU_DUMP_BLIT_KERNELS, false, \ "Dump the kernels for blit manager") \ +release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ + "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ release(bool, GPU_FLUSH_ON_EXECUTION, false, \ "Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \ +release(bool, GPU_USE_SYNC_OBJECTS, true, \ + "If enabled, use sync objects instead of polling") \ release(bool, CL_KHR_FP64, true, \ "Enable/Disable support for double precision") \ release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ @@ -70,8 +86,12 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ "Set clLinkProgram()'s options (override)") \ release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ "Append clLinkProgram()'s options") \ +release(cstring, AMD_OCL_SC_LIB, 0, \ + "Set shader compiler shared library name or path") \ debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ "Specify binary substitution config file for OpenCL") \ +debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \ + "Enable the error dialog on Windows") \ release(size_t, GPU_PINNED_XFER_SIZE, 32, \ "The pinned buffer size for pinning in read/write transfers in MiB") \ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ @@ -80,6 +100,12 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ "The resource cache size in MB") \ release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ "The maximum size accepted for suballocaitons in KB") \ +release(bool, GPU_FORCE_64BIT_PTR, 0, \ + "Forces 64 bit pointers on GPU") \ +release(bool, GPU_FORCE_OCL20_32BIT, 0, \ + "Forces 32 bit apps to take CLANG\HSAIL path") \ +release(bool, GPU_RAW_TIMESTAMP, 0, \ + "Reports GPU raw timestamps in GPU timeline") \ release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ "Number of memory objects for dependency tracking") \ release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ @@ -90,20 +116,32 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \ "Maximum size of a single allocation as percentage of total") \ release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ +release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \ + "GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \ release(uint, GPU_WORKLOAD_SPLIT, 22, \ "Workload split size") \ +release(bool, GPU_USE_SINGLE_SCRATCH, false, \ + "Use single scratch buffer per device instead of per HW ring") \ release(bool, AMD_OCL_WAIT_COMMAND, false, \ "1 = Enable a wait for every submitted command") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ +release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ + "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, true, \ "Enable thread trace extension") \ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ "Force GPU opencl verison") \ +release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ + "Enable HSA device local memory usage") \ release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ "Kernarg pool size") \ +release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ + "Enable device memory for coarse grain SVM allocations") \ +release(bool, GPU_IFH_MODE, false, \ + "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \ release(uint, GPU_ENABLE_PAL, 2, \ @@ -114,6 +152,8 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ +debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \ + "Use flag to change the minimum requirement of system memory not to downgrade") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ @@ -136,6 +176,10 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ "File path prefix for dumping wave limiter output") \ release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ "File path prefix for tracing wave limiter") \ +release(bool, OCL_CODE_CACHE_ENABLE, false, \ + "1 = Enable compiler code cache") \ +release(bool, OCL_CODE_CACHE_RESET, false, \ + "1 = Reset the compiler code cache storage") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \