From e631b3978f75dec124f2f8d498c44d6bbb389d69 Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 7 Jan 2019 18:02:10 -0500 Subject: [PATCH] P4 to Git Change 1727059 by asalmanp@asalmanp-ocl-stg on 2019/01/07 17:48:44 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SWDEV-132899 - [OCL][GFX10] report number of WGP by default on gfx10 ASICs Both HSAIL/SC and LC compilers use WGP mode by default on gfx10 ASICs (i.e., COMPUTE_PGM_RSRC1.WGP_MODE is set to 1 by both compilers) therefore runtime should report number of WGP (i.e., CU/2) on gfx10 ASICs by default. The new environment variable (GPU_ENABLE_WGP_MODE = 0) can be used to force CU mode on LC (i.e., -mcumode option) if it's needed (HSAIL/SC doesn't have any compiler option for forcing the CU mode) Also, using the new environment variable (GPU_ENABLE_WAVE32_MODE) to control the wave32 mode on gfx10+. ReviewRequestURL = http://ocltc.amd.com/reviews/r/16435/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#329 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#121 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#65 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#301 edit --- rocclr/runtime/device/device.hpp | 7 ++-- rocclr/runtime/device/devprogram.cpp | 40 ++++++++++++++++++----- rocclr/runtime/device/pal/paldevice.cpp | 14 +++++--- rocclr/runtime/device/pal/palsettings.cpp | 16 +++++++-- rocclr/runtime/utils/flags.hpp | 6 ++-- 5 files changed, 63 insertions(+), 20 deletions(-) diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp index 5a3343ef4b..c804713e25 100644 --- a/rocclr/runtime/device/device.hpp +++ b/rocclr/runtime/device/device.hpp @@ -519,9 +519,12 @@ class Settings : public amd::HeapObject { uint reportFMAF_ : 1; //!< Report FP_FAST_FMAF define in CL program uint reportFMA_ : 1; //!< Report FP_FAST_FMA define in CL program uint singleFpDenorm_ : 1; //!< Support Single FP Denorm - uint gfx10Hsail_ : 1 ; //!< GFX10 HSAIL path + uint hsailExplicitXnack_ : 1; //!< Xnack in hsail path for this deivce uint useLightning_ : 1; //!< Enable LC path for this device - uint reserved_ : 18; + uint enableWgpMode_ : 1; //!< Enable WGP mode for this device + uint enableWave32Mode_ : 1; //!< Enable Wave32 mode for this device + uint lcWavefrontSize64_ : 1; //!< Enable Wave64 mode for this device + uint reserved_ : 15; }; uint value_; }; diff --git a/rocclr/runtime/device/devprogram.cpp b/rocclr/runtime/device/devprogram.cpp index 6ae2665f0e..b2ad8e9eba 100644 --- a/rocclr/runtime/device/devprogram.cpp +++ b/rocclr/runtime/device/devprogram.cpp @@ -604,6 +604,14 @@ bool Program::compileImplLC(const std::string& sourceCode, // Set whole program mode driverOptions.append(" -mllvm -amdgpu-early-inline-all -mllvm -amdgpu-prelink"); + if (!device().settings().enableWgpMode_) { + driverOptions.append(" -mcumode"); + } + + if (device().settings().lcWavefrontSize64_) { + driverOptions.append(" -mwavefrontsize64"); + } + // Iterate through each source code and dump it into tmp std::fstream f; std::vector headerFileNames(headers.size()); @@ -1405,6 
+1413,14 @@ bool Program::linkImplLC(amd::option::Options* options) { // Set whole program mode codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); + if (!device().settings().enableWgpMode_) { + codegenOptions.append(" -mcumode"); + } + + if (device().settings().lcWavefrontSize64_) { + codegenOptions.append(" -mwavefrontsize64"); + } + // NOTE: The params is also used to identy cached code object. This parameter // should not contain any dyanamically generated filename. char* executable = nullptr; @@ -1615,6 +1631,14 @@ bool Program::linkImplLC(amd::option::Options* options) { // Set whole program mode codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); + if (!device().settings().enableWgpMode_) { + codegenOptions.append(" -mcumode"); + } + + if (device().settings().lcWavefrontSize64_) { + codegenOptions.append(" -mwavefrontsize64"); + } + // Tokenize the options string into a vector of strings std::istringstream strstr(codegenOptions); std::istream_iterator sit(strstr), end; @@ -1724,14 +1748,14 @@ bool Program::linkImplHSAIL(amd::option::Options* options) { if (device().isFineGrainedSystem(true)) { fin_options.append(" -sc-xnack-iommu"); } - if (device().settings().gfx10Hsail_) { - if (GPU_FORCE_WAVE_SIZE_32) { - fin_options.append(" -force-wave-size-32"); - } - if (xnackEnabled_) { - fin_options.append(" -xnack"); - } - } + + if (device().settings().enableWave32Mode_) { + fin_options.append(" -force-wave-size-32"); + } + + if (device().settings().hsailExplicitXnack_) { + fin_options.append(" -xnack"); + } errorCode = aclCompile(device().compiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, logFunction); diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 2b997d0ae5..cd246ae6c2 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -277,7 +277,7 @@ bool 
NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel, // Runtime doesn't know what local size could be on the real board info_.maxGlobalVariableSize_ = static_cast(512 * Mi); - info_.wavefrontWidth_ = (ipLevel >= Pal::GfxIpLevel::GfxIp10) ? 32 : 64; + info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32 : 64; if (settings().useLightning_) { #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) @@ -351,7 +351,12 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.vendorId_ = palProp.vendorId; info_.maxWorkItemDimensions_ = 3; - info_.maxComputeUnits_ = palProp.gfxipProperties.shaderCore.numAvailableCus; + + info_.maxComputeUnits_ = settings().enableWgpMode_ ? + palProp.gfxipProperties.shaderCore.numAvailableCus / 2 : + palProp.gfxipProperties.shaderCore.numAvailableCus; + + info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines; // SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates. @@ -611,9 +616,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray; info_.simdWidth_ = hwInfo()->simdWidth_; info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; - info_.wavefrontWidth_ = ((settings().gfx10Hsail_ && GPU_FORCE_WAVE_SIZE_32) || - (settings().useLightning_ && ipLevel_ >= Pal::GfxIpLevel::GfxIp10_1)) ? - 32 :palProp.gfxipProperties.shaderCore.wavefrontSize; + info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 
32: + palProp.gfxipProperties.shaderCore.wavefrontSize; info_.availableSGPRs_ = palProp.gfxipProperties.shaderCore.numAvailableSgprs; info_.globalMemChannelBanks_ = 4; diff --git a/rocclr/runtime/device/pal/palsettings.cpp b/rocclr/runtime/device/pal/palsettings.cpp index 150e71c082..b96eb2d980 100644 --- a/rocclr/runtime/device/pal/palsettings.cpp +++ b/rocclr/runtime/device/pal/palsettings.cpp @@ -140,6 +140,10 @@ Settings::Settings() { maxCmdBuffers_ = 12; useLightning_ = GPU_ENABLE_LC; + enableWgpMode_ = false; + enableWave32Mode_ = false; + hsailExplicitXnack_ = false; + lcWavefrontSize64_ = false; } bool Settings::create(const Pal::DeviceProperties& palProp, @@ -185,7 +189,15 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Navi10: case Pal::AsicRevision::Navi10Lite: gfx10Plus_ = true; - // Fall through to AI (gfx9) ... + hsailExplicitXnack_ = static_cast(palProp.gpuMemoryProperties.flags.pageMigrationEnabled + || palProp.gpuMemoryProperties.flags.iommuv2Support); + enableWgpMode_ = GPU_ENABLE_WGP_MODE || !useLightning_; + enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE; + if (palProp.revision == Pal::AsicRevision::Navi10Lite && useLightning_) { + enableWave32Mode_ = false; + } + lcWavefrontSize64_ = !enableWave32Mode_; + // Fall through to AI (gfx9) ... 
case Pal::AsicRevision::Vega20: case Pal::AsicRevision::Vega12: case Pal::AsicRevision::Vega10: @@ -451,8 +463,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, } } - gfx10Hsail_ = gfx10Plus_; - // Override current device settings override(); diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index 99196ee588..deb939e4f5 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -169,8 +169,8 @@ release_on_stg(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 50, \ "The number of dispatches for RGP capture with SQTT") \ -release(bool, GPU_FORCE_WAVE_SIZE_32, false, \ - "Forces WaveSize32 compilation in SC") \ +release(bool, GPU_ENABLE_WAVE32_MODE, true, \ + "Enables Wave32 compilation in HW if available") \ release(bool, GPU_ENABLE_LC, IS_LIGHTNING, \ "Enables LC path") \ release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \ @@ -179,6 +179,8 @@ release(cstring, HIP_VISIBLE_DEVICES, "", "Only devices whose index is present in the sequence are visible to HIP") \ release(cstring, CUDA_VISIBLE_DEVICES, "", \ "Only devices whose index is present in the sequence are visible to HIP") \ +release(bool, GPU_ENABLE_WGP_MODE, true, \ + "Enables WGP Mode in HW if available") \ namespace amd {