From e631b3978f75dec124f2f8d498c44d6bbb389d69 Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 7 Jan 2019 18:02:10 -0500
Subject: [PATCH] P4 to Git Change 1727059 by asalmanp@asalmanp-ocl-stg on
2019/01/07 17:48:44
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
SWDEV-132899 - [OCL][GFX10] report number of WGP by default on gfx10 ASICs
Both the HSAIL/SC and LC compilers use WGP mode by default on gfx10 ASICs (i.e., COMPUTE_PGM_RSRC1.WGP_MODE is set to 1 by both compilers); therefore, the runtime should report the number of WGPs (i.e., CU/2) on gfx10 ASICs by default.
The new environment variable (GPU_ENABLE_WGP_MODE = 0) can be used to force CU mode on LC (i.e., the -mcumode option) if it's needed (HSAIL/SC doesn't have any compiler option for forcing CU mode).
Also, the new environment variable (GPU_ENABLE_WAVE32_MODE) is now used to control the wave32 mode on gfx10+.
ReviewRequestURL = http://ocltc.amd.com/reviews/r/16435/diff/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#329 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#121 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#65 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#301 edit
---
rocclr/runtime/device/device.hpp | 7 ++--
rocclr/runtime/device/devprogram.cpp | 40 ++++++++++++++++++-----
rocclr/runtime/device/pal/paldevice.cpp | 14 +++++---
rocclr/runtime/device/pal/palsettings.cpp | 16 +++++++--
rocclr/runtime/utils/flags.hpp | 6 ++--
5 files changed, 63 insertions(+), 20 deletions(-)
diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp
index 5a3343ef4b..c804713e25 100644
--- a/rocclr/runtime/device/device.hpp
+++ b/rocclr/runtime/device/device.hpp
@@ -519,9 +519,12 @@ class Settings : public amd::HeapObject {
uint reportFMAF_ : 1; //!< Report FP_FAST_FMAF define in CL program
uint reportFMA_ : 1; //!< Report FP_FAST_FMA define in CL program
uint singleFpDenorm_ : 1; //!< Support Single FP Denorm
- uint gfx10Hsail_ : 1 ; //!< GFX10 HSAIL path
+ uint hsailExplicitXnack_ : 1; //!< Xnack in hsail path for this device
uint useLightning_ : 1; //!< Enable LC path for this device
- uint reserved_ : 18;
+ uint enableWgpMode_ : 1; //!< Enable WGP mode for this device
+ uint enableWave32Mode_ : 1; //!< Enable Wave32 mode for this device
+ uint lcWavefrontSize64_ : 1; //!< Enable Wave64 mode for this device
+ uint reserved_ : 15;
};
uint value_;
};
diff --git a/rocclr/runtime/device/devprogram.cpp b/rocclr/runtime/device/devprogram.cpp
index 6ae2665f0e..b2ad8e9eba 100644
--- a/rocclr/runtime/device/devprogram.cpp
+++ b/rocclr/runtime/device/devprogram.cpp
@@ -604,6 +604,14 @@ bool Program::compileImplLC(const std::string& sourceCode,
// Set whole program mode
driverOptions.append(" -mllvm -amdgpu-early-inline-all -mllvm -amdgpu-prelink");
+ if (!device().settings().enableWgpMode_) {
+ driverOptions.append(" -mcumode");
+ }
+
+ if (device().settings().lcWavefrontSize64_) {
+ driverOptions.append(" -mwavefrontsize64");
+ }
+
// Iterate through each source code and dump it into tmp
std::fstream f;
std::vector headerFileNames(headers.size());
@@ -1405,6 +1413,14 @@ bool Program::linkImplLC(amd::option::Options* options) {
// Set whole program mode
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
+ if (!device().settings().enableWgpMode_) {
+ codegenOptions.append(" -mcumode");
+ }
+
+ if (device().settings().lcWavefrontSize64_) {
+ codegenOptions.append(" -mwavefrontsize64");
+ }
+
// NOTE: The params is also used to identy cached code object. This parameter
// should not contain any dyanamically generated filename.
char* executable = nullptr;
@@ -1615,6 +1631,14 @@ bool Program::linkImplLC(amd::option::Options* options) {
// Set whole program mode
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
+ if (!device().settings().enableWgpMode_) {
+ codegenOptions.append(" -mcumode");
+ }
+
+ if (device().settings().lcWavefrontSize64_) {
+ codegenOptions.append(" -mwavefrontsize64");
+ }
+
// Tokenize the options string into a vector of strings
std::istringstream strstr(codegenOptions);
std::istream_iterator sit(strstr), end;
@@ -1724,14 +1748,14 @@ bool Program::linkImplHSAIL(amd::option::Options* options) {
if (device().isFineGrainedSystem(true)) {
fin_options.append(" -sc-xnack-iommu");
}
- if (device().settings().gfx10Hsail_) {
- if (GPU_FORCE_WAVE_SIZE_32) {
- fin_options.append(" -force-wave-size-32");
- }
- if (xnackEnabled_) {
- fin_options.append(" -xnack");
- }
- }
+
+ if (device().settings().enableWave32Mode_) {
+ fin_options.append(" -force-wave-size-32");
+ }
+
+ if (device().settings().hsailExplicitXnack_) {
+ fin_options.append(" -xnack");
+ }
errorCode = aclCompile(device().compiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG,
ACL_TYPE_ISA, logFunction);
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 2b997d0ae5..cd246ae6c2 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -277,7 +277,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
// Runtime doesn't know what local size could be on the real board
info_.maxGlobalVariableSize_ = static_cast(512 * Mi);
- info_.wavefrontWidth_ = (ipLevel >= Pal::GfxIpLevel::GfxIp10) ? 32 : 64;
+ info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32 : 64;
if (settings().useLightning_) {
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
@@ -351,7 +351,12 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.vendorId_ = palProp.vendorId;
info_.maxWorkItemDimensions_ = 3;
- info_.maxComputeUnits_ = palProp.gfxipProperties.shaderCore.numAvailableCus;
+
+ info_.maxComputeUnits_ = settings().enableWgpMode_ ?
+ palProp.gfxipProperties.shaderCore.numAvailableCus / 2 :
+ palProp.gfxipProperties.shaderCore.numAvailableCus;
+
+
info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
// SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates.
@@ -611,9 +616,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
- info_.wavefrontWidth_ = ((settings().gfx10Hsail_ && GPU_FORCE_WAVE_SIZE_32) ||
- (settings().useLightning_ && ipLevel_ >= Pal::GfxIpLevel::GfxIp10_1)) ?
- 32 :palProp.gfxipProperties.shaderCore.wavefrontSize;
+ info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32:
+ palProp.gfxipProperties.shaderCore.wavefrontSize;
info_.availableSGPRs_ = palProp.gfxipProperties.shaderCore.numAvailableSgprs;
info_.globalMemChannelBanks_ = 4;
diff --git a/rocclr/runtime/device/pal/palsettings.cpp b/rocclr/runtime/device/pal/palsettings.cpp
index 150e71c082..b96eb2d980 100644
--- a/rocclr/runtime/device/pal/palsettings.cpp
+++ b/rocclr/runtime/device/pal/palsettings.cpp
@@ -140,6 +140,10 @@ Settings::Settings() {
maxCmdBuffers_ = 12;
useLightning_ = GPU_ENABLE_LC;
+ enableWgpMode_ = false;
+ enableWave32Mode_ = false;
+ hsailExplicitXnack_ = false;
+ lcWavefrontSize64_ = false;
}
bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -185,7 +189,15 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
case Pal::AsicRevision::Navi10:
case Pal::AsicRevision::Navi10Lite:
gfx10Plus_ = true;
- // Fall through to AI (gfx9) ...
+ hsailExplicitXnack_ = static_cast(palProp.gpuMemoryProperties.flags.pageMigrationEnabled
+ || palProp.gpuMemoryProperties.flags.iommuv2Support);
+ enableWgpMode_ = GPU_ENABLE_WGP_MODE || !useLightning_;
+ enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE;
+ if (palProp.revision == Pal::AsicRevision::Navi10Lite && useLightning_) {
+ enableWave32Mode_ = false;
+ }
+ lcWavefrontSize64_ = !enableWave32Mode_;
+ // Fall through to AI (gfx9) ...
case Pal::AsicRevision::Vega20:
case Pal::AsicRevision::Vega12:
case Pal::AsicRevision::Vega10:
@@ -451,8 +463,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
}
}
- gfx10Hsail_ = gfx10Plus_;
-
// Override current device settings
override();
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 99196ee588..deb939e4f5 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -169,8 +169,8 @@ release_on_stg(bool, PAL_DISABLE_SDMA, false, \
"1 = Disable SDMA for PAL") \
release(uint, PAL_RGP_DISP_COUNT, 50, \
"The number of dispatches for RGP capture with SQTT") \
-release(bool, GPU_FORCE_WAVE_SIZE_32, false, \
- "Forces WaveSize32 compilation in SC") \
+release(bool, GPU_ENABLE_WAVE32_MODE, true, \
+ "Enables Wave32 compilation in HW if available") \
release(bool, GPU_ENABLE_LC, IS_LIGHTNING, \
"Enables LC path") \
release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \
@@ -179,6 +179,8 @@ release(cstring, HIP_VISIBLE_DEVICES, "",
"Only devices whose index is present in the sequence are visible to HIP") \
release(cstring, CUDA_VISIBLE_DEVICES, "", \
"Only devices whose index is present in the sequence are visible to HIP") \
+release(bool, GPU_ENABLE_WGP_MODE, true, \
+ "Enables WGP Mode in HW if available") \
namespace amd {