5e3d4f5a01
SWDEV-82596 - HSA HLC: Create AMDInline pass The generic llvm inlining heuristics do not work well for GPU. In particular we have a common problem in several tests: If we have a pointer to private array passed into a function it will not be optimized out, leaving scratch usage. The pass increases the inline threshold to allow inlining in this case. Also, now we can move at least some AMD inlining customizations into this file from the common code. Inline hint threshold is moved in this change. Performance impact on ocltst sha256, 32 bit, Fiji: AMDIL HSAIL Diff HSAIL+Inliner Diff Diff before to AMDIL to HSAIL to AMDIL OCLPerfSHA256[ 0] 43.843 40.894 0.93 69.910 1.71 1.59 OCLPerfSHA256[ 1] 53.611 51.083 0.95 80.919 1.58 1.51 OCLPerfSHA256[ 2] 52.127 51.528 0.99 80.640 1.56 1.55 OCLPerfSHA256[ 3] 60.952 57.027 0.94 68.615 1.20 1.13 OCLPerfSHA256[ 4] 76.173 70.150 0.92 80.582 1.15 1.06 OCLPerfSHA256[ 5] 75.886 70.264 0.93 81.000 1.15 1.07 Testing: smoke, precheckin, ocltst sha256 Reviewed by Danill Fukalov Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/opt_level.cpp#28 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/include/llvm/InitializePasses.h#93 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/include/llvm/LinkAllPasses.h#49 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/include/llvm/Transforms/IPO.h#32 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Transforms/IPO/AMDInline.cpp#1 add ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Transforms/IPO/CMakeLists.txt#24 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Transforms/IPO/IPO.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Transforms/IPO/Inliner.cpp#42 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/tools/opt/amdopt.inc#28 edit
272 строки
7.7 KiB
C++
272 строки
7.7 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
#include "top.hpp"
|
|
#include "opt_level.hpp"
|
|
#include "library.hpp"
|
|
#include "acl.h"
|
|
#include "utils/options.hpp"
|
|
#include "utils/target_mappings.h"
|
|
#include "utils/libUtils.h"
|
|
#include "llvm/Analysis/Passes.h"
|
|
#if defined(LEGACY_COMPLIB)
|
|
#include "llvm/DataLayout.h"
|
|
#include "llvm/Module.h"
|
|
#endif
|
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
|
#include "llvm/LinkAllPasses.h"
|
|
#include "llvm/Support/TargetRegistry.h"
|
|
#include "llvm/Transforms/IPO/AMDOptOptions.h"
|
|
#include "compiler_stage.hpp"
|
|
using namespace amdcl;
|
|
using namespace llvm;
|
|
|
|
void
|
|
OptLevel::setup(bool isGPU, uint32_t OptLevel)
|
|
{
|
|
// Add an appropriate DataLayout instance for this module.
|
|
#if defined(LEGACY_COMPLIB)
|
|
Passes().add(new DataLayout(module_));
|
|
#else
|
|
Passes().add(new DataLayoutPass());
|
|
#endif
|
|
fpasses_ = new FunctionPassManager(module_);
|
|
#if defined(LEGACY_COMPLIB)
|
|
fpasses_->add(new DataLayout(module_));
|
|
#else
|
|
fpasses_->add(new DataLayoutPass());
|
|
#endif
|
|
|
|
PassManagerBuilder Builder;
|
|
Builder.OptLevel = OptLevel;
|
|
|
|
if (Options()->libraryType_ == amd::GPU_Library_HSAIL) {
|
|
if (OptLevel == 0) return;
|
|
}
|
|
|
|
if (!Options()->oVariables->Inline) {
|
|
// No inlining pass
|
|
} else if (isGPU) {
|
|
#ifdef WITH_TARGET_HSAIL
|
|
if (Options()->libraryType_ == amd::GPU_Library_HSAIL) {
|
|
if (HLC_HSAIL_Enable_Calls) {
|
|
HLC_Disable_Amd_Inline_All = true;
|
|
} else {
|
|
HLC_Disable_Amd_Inline_All = false;
|
|
}
|
|
// Always create Inliner regardless of OptLevel
|
|
if (HLC_Force_Always_Inliner_Pass) {
|
|
Builder.Inliner = createAlwaysInlinerPass();
|
|
} else {
|
|
Builder.Inliner = createAMDFunctionInliningPass(HLC_HSAIL_Inline_Threshold);
|
|
}
|
|
} else
|
|
#endif
|
|
{
|
|
HLC_Disable_Amd_Inline_All = false;
|
|
// Always create Inliner regardless of OptLevel
|
|
Builder.Inliner = createAMDFunctionInliningPass(500);
|
|
}
|
|
} else if (OptLevel > 1) {
|
|
unsigned Threshold = 225;
|
|
if (OptLevel > 2)
|
|
Threshold = 275;
|
|
#ifdef WITH_TARGET_HSAIL
|
|
if (Options()->libraryType_ == amd::GPU_Library_HSAIL) {
|
|
// Don't do inlining (including createAlwaysInlinerPass()) if OptimizationLevel
|
|
// is zero becaue we are generating code for -g
|
|
if (OptLevel > 0) {
|
|
Builder.Inliner = createAMDFunctionInliningPass(Threshold);
|
|
}
|
|
} else
|
|
#endif
|
|
{
|
|
Builder.Inliner = createAMDFunctionInliningPass(Threshold);
|
|
}
|
|
}
|
|
Builder.SizeLevel = 0;
|
|
Builder.DisableUnitAtATime = false;
|
|
Builder.DisableUnrollLoops = OptLevel == 0;
|
|
#if defined(LEGACY_COMPLIB)
|
|
if (Options()->libraryType_ != amd::GPU_Library_HSAIL)
|
|
Builder.DisableSimplifyLibCalls = true;
|
|
#endif
|
|
Builder.AMDpopulateFunctionPassManager(*fpasses_, &module_->getContext());
|
|
Builder.AMDpopulateModulePassManager(passes_, &module_->getContext(), module_);
|
|
}
|
|
|
|
void
|
|
OptLevel::run(aclBinary *elf)
|
|
{
|
|
#if !defined(LEGACY_COMPLIB)
|
|
const aclTargetInfo* trg = aclutGetTargetInfo(elf);
|
|
TargetMachine *Machine = nullptr;
|
|
if (trg) {
|
|
llvm::Triple TheTriple(getTriple(trg->arch_id));
|
|
if (TheTriple.getArch()) {
|
|
std::string Error;
|
|
llvm::StringRef MArch(aclGetArchitecture(*trg));
|
|
const Target *TheTarget = TargetRegistry::lookupTarget(MArch, TheTriple,
|
|
Error);
|
|
if (TheTarget) {
|
|
llvm::TargetOptions targetOptions;
|
|
targetOptions.NoFramePointerElim = false;
|
|
targetOptions.StackAlignmentOverride = Options()->oVariables->CPUStackAlignment;
|
|
#ifdef WITH_TARGET_HSAIL
|
|
if (Options()->libraryType_ == amd::GPU_Library_HSAIL)
|
|
targetOptions.UnsafeFPMath = Options()->oVariables->UnsafeMathOpt;
|
|
#endif
|
|
targetOptions.LessPreciseFPMADOption = Options()->oVariables->MadEnable ||
|
|
Options()->oVariables->EnableMAD;
|
|
targetOptions.NoInfsFPMath = Options()->oVariables->FiniteMathOnly;
|
|
targetOptions.NoNaNsFPMath = Options()->oVariables->FastRelaxedMath;
|
|
|
|
llvm::CodeGenOpt::Level OLvl = CodeGenOpt::None;
|
|
switch (Options()->oVariables->OptLevel) {
|
|
case 0: // -O0
|
|
OLvl = CodeGenOpt::None;
|
|
break;
|
|
case 1: // -O1
|
|
OLvl = CodeGenOpt::Less;
|
|
break;
|
|
case 2: // -O2
|
|
case 5: // -O5(-Os)
|
|
OLvl = CodeGenOpt::Default;
|
|
break;
|
|
case 3: // -O3
|
|
case 4: // -O4
|
|
OLvl = CodeGenOpt::Aggressive;
|
|
break;
|
|
default:
|
|
assert(!"Error with optimization level");
|
|
};
|
|
|
|
Machine = TheTarget->createTargetMachine(TheTriple.getTriple(),
|
|
aclutGetCodegenName(elf->target),
|
|
"", targetOptions,
|
|
WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_),
|
|
CodeModel::Default, OLvl);
|
|
}
|
|
}
|
|
}
|
|
std::unique_ptr<TargetMachine> TM(Machine);
|
|
if (TM.get())
|
|
TM->addAnalysisPasses(passes_);
|
|
#endif
|
|
|
|
if (Options()->oVariables->OptPrintLiveness) {
|
|
Passes().add(createAMDLivenessPrinterPass());
|
|
}
|
|
fpasses_->doInitialization();
|
|
for (Module::iterator I = module_->begin(), E = module_->end(); I != E; ++I)
|
|
fpasses_->run(*I);
|
|
fpasses_->doFinalization();
|
|
// Now that we have all of the passes ready, run them.
|
|
passes_.run(*module_);
|
|
|
|
delete fpasses_;
|
|
}
|
|
|
|
int
|
|
O0OptLevel::optimize(aclBinary *elf, Module *input, bool isGPU)
|
|
{
|
|
// With -O0, we don't do anything
|
|
module_ = input;
|
|
#ifdef WITH_TARGET_HSAIL
|
|
if (Options()->libraryType_ == amd::GPU_Library_HSAIL) {
|
|
// Mark all non-kernel functions as having internal linkage
|
|
Passes().add(createAMDSymbolLinkagePass(true, NULL));
|
|
} else
|
|
#endif
|
|
{
|
|
setup(false, 0);
|
|
run(elf);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
GPUO0OptLevel::optimize(aclBinary *elf, Module *input, bool isGPU)
|
|
{
|
|
module_ = input;
|
|
assert(isGPU && "Only a GPU can use GPUO0OptLevel!\n");
|
|
setup(true, 0);
|
|
#ifdef WITH_TARGET_HSAIL
|
|
if (Options()->libraryType_ == amd::GPU_Library_HSAIL) {
|
|
// On the GPU, even with -O0, we must do some optimizations. One
|
|
// goal is to ensure that all functions are inlined. This requires
|
|
// three steps in that order:
|
|
//
|
|
// 1. Mark all non-kernel functions as having internal linkage.
|
|
// 2. Invoke the GlobalOptimizer to resolve function aliases.
|
|
// 3. Force inlining using our custom inliner pass.
|
|
if (Options()->oVariables->EnableDebug) {
|
|
HLC_HSAIL_Enable_Calls = false;
|
|
HLC_Disable_Amd_Inline_All = false;
|
|
}
|
|
else if (HLC_HSAIL_Enable_Calls) {
|
|
HLC_Disable_Amd_Inline_All = true;
|
|
}
|
|
else {
|
|
HLC_Disable_Amd_Inline_All = false;
|
|
}
|
|
Passes().add(createAMDSymbolLinkagePass(true, NULL));
|
|
Passes().add(createGlobalOptimizerPass());
|
|
if (!HLC_Disable_Amd_Inline_All &&
|
|
!DisableInline &&
|
|
!HLC_Force_Always_Inliner_Pass) {
|
|
Passes().add(createAMDInlineAllPass(true));
|
|
} else {
|
|
Passes().add(createAlwaysInlinerPass());
|
|
}
|
|
}
|
|
#endif
|
|
run(elf);
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
O1OptLevel::optimize(aclBinary *elf, Module *input, bool isGPU)
|
|
{
|
|
module_ = input;
|
|
setup(isGPU, 1);
|
|
run(elf);
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
O2OptLevel::optimize(aclBinary *elf, Module *input, bool isGPU)
|
|
{
|
|
module_ = input;
|
|
setup(isGPU, 2);
|
|
run(elf);
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
O3OptLevel::optimize(aclBinary *elf, Module *input, bool isGPU)
|
|
{
|
|
module_ = input;
|
|
setup(isGPU, 3);
|
|
run(elf);
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
O4OptLevel::optimize(aclBinary *elf, Module *input, bool isGPU)
|
|
{
|
|
module_ = input;
|
|
setup(isGPU, 4);
|
|
run(elf);
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
OsOptLevel::optimize(aclBinary *elf, Module *input, bool isGPU)
|
|
{
|
|
module_ = input;
|
|
setup(isGPU, 5);
|
|
run(elf);
|
|
return 0;
|
|
}
|