5f67b5bb68
ECR #333753 - TargetMachine is created not only for codegen, but for the optimizer as well. This is to provide target specific optimizations in the intermediate optimizer. LLVM 3.6 provides TargetTransformInfo for this purpose which requires TargetMachine. No correctness issues will occur if TargetInfo is not created, but optimizations will target a generic machine. Testing: smoke, precheckin Reviewed by Evgeny Mankov Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/codegen.cpp#60 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/opt_level.cpp#23 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/opt_level.hpp#3 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/optimizer.cpp#25 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.h#16 edit
351 строка
11 KiB
C++
351 строка
11 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
#include "top.hpp"
|
|
#include "codegen.hpp"
|
|
#include "utils/libUtils.h"
|
|
#include "os/os.hpp"
|
|
#include "jit/src/jit.hpp"
|
|
#include "utils/target_mappings.h"
|
|
#ifdef _MSC_VER
|
|
/* for disabling warning in llvm/ADT/Statistic.h */
|
|
#pragma warning(disable:4146)
|
|
#endif
|
|
#include "llvm/ADT/Statistic.h"
|
|
#ifdef _MSC_VER
|
|
#pragma warning(default:4146)
|
|
#endif
|
|
#include "llvm/Support/FormattedStream.h"
|
|
#include "llvm/Support/Host.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/Support/TargetSelect.h"
|
|
#include "llvm/DataLayout.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
#include "llvm/Target/TargetOptions.h"
|
|
#include "llvm/Support/TargetRegistry.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <fstream>
|
|
#include <memory>
|
|
|
|
using namespace amdcl;
|
|
using namespace llvm;
|
|
|
|
/*! Function that modifies the code gen level based on the
|
|
* function size threshhold.
|
|
*/
|
|
static CodeGenOpt::Level
|
|
AdjustCGOptLevel(Module& M, CodeGenOpt::Level OrigOLvl)
|
|
{
|
|
const unsigned int FuncSizeThreshold = 10000;
|
|
if (OrigOLvl == CodeGenOpt::None)
|
|
return OrigOLvl;
|
|
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
|
|
Function *F = (Function *)I;
|
|
if (F->size() > FuncSizeThreshold) {
|
|
return CodeGenOpt::None;
|
|
}
|
|
}
|
|
return OrigOLvl;
|
|
}
|
|
|
|
int
|
|
llvmCodeGen(
|
|
Module* Composite,
|
|
amd::option::Options *OptionsObj,
|
|
std::string& output,
|
|
aclBinary* binary)
|
|
{
|
|
const FamilyMapping &familyMap = familySet[binary->target.arch_id];
|
|
const bool optimize = (OptionsObj ? (OptionsObj->oVariables->OptLevel > 0) : true);
|
|
const TargetMapping* targetMap = familyMap.target;
|
|
unsigned famID = binary->target.chip_id;
|
|
if (!targetMap || !targetMap[famID].supported) {
|
|
LogError("Device is not supported by code generator!");
|
|
return 1;
|
|
}
|
|
|
|
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1463
|
|
#else
|
|
// a dirty way to guarantee "push bp" inserted by CodeGen in prologue
|
|
llvm::NoFramePointerElim = !optimize;
|
|
#endif
|
|
// Load the module to be compiled...
|
|
Module &mod = *Composite;
|
|
|
|
// FIXME: The triple given in this map is wrong and isn't really
|
|
// useful. Only need the architecture.
|
|
const std::string TargetTriple = std::string(familyMap.triple);
|
|
Triple TheTriple(TargetTriple);
|
|
if (TheTriple.getTriple().empty()) {
|
|
TheTriple.setTriple(sys::getDefaultTargetTriple());
|
|
}
|
|
|
|
Triple::ArchType arch = TheTriple.getArch();
|
|
|
|
bool isGPU = (arch == Triple::amdil || arch == Triple::amdil64 ||
|
|
arch == Triple::hsail || arch == Triple::hsail64);
|
|
|
|
if (isGPU) {
|
|
TheTriple.setOS(Triple::UnknownOS);
|
|
} else { // CPUs
|
|
// FIXME: This should come from somewhere else.
|
|
#ifdef __linux__
|
|
TheTriple.setOS(Triple::Linux);
|
|
#else
|
|
TheTriple.setOS(Triple::MinGW32);
|
|
#endif
|
|
}
|
|
|
|
TheTriple.setEnvironment(Triple::AMDOpenCL);
|
|
// FIXME: need to make AMDOpenCL be the same as ELF
|
|
if (OptionsObj->oVariables->UseJIT)
|
|
TheTriple.setEnvironment(Triple::ELF);
|
|
mod.setTargetTriple(TheTriple.getTriple());
|
|
|
|
// Allocate target machine. First, check whether the user has explicitly
|
|
// specified an architecture to compile for. If so we have to look it up by
|
|
// name, because it might be a backend that has no mapping to a target triple.
|
|
const Target *TheTarget = 0;
|
|
assert(binary->target.arch_id != aclError && "Cannot have the error device!");
|
|
|
|
std::string MArch = familyMap.architecture;
|
|
|
|
#ifdef WITH_TARGET_HSAIL
|
|
if (MArch == "hsail" && OptionsObj->oVariables->GPU64BitIsa) {
|
|
MArch = std::string("hsail-64");
|
|
}
|
|
#endif
|
|
|
|
for (TargetRegistry::iterator it = TargetRegistry::begin(),
|
|
ie = TargetRegistry::end(); it != ie; ++it) {
|
|
if (MArch == it->getName()) {
|
|
TheTarget = &*it;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!TheTarget) {
|
|
errs() << ": ERROR: invalid target '" << MArch << "'.\n";
|
|
return 1;
|
|
}
|
|
|
|
CodeGenOpt::Level OLvl = CodeGenOpt::None;
|
|
switch (OptionsObj->oVariables->OptLevel) {
|
|
case 0: // -O0
|
|
OLvl = CodeGenOpt::None;
|
|
break;
|
|
case 1: // -O1
|
|
OLvl = CodeGenOpt::Less;
|
|
break;
|
|
default:
|
|
assert(!"Error with optimization level");
|
|
case 2: // -O2
|
|
case 5: // -O5(-Os)
|
|
OLvl = CodeGenOpt::Default;
|
|
break;
|
|
case 3: // -O3
|
|
case 4: // -O4
|
|
OLvl = CodeGenOpt::Aggressive;
|
|
break;
|
|
};
|
|
|
|
// If there is a very big function, lower the optimization level.
|
|
OLvl = AdjustCGOptLevel(mod, OLvl);
|
|
|
|
// Adjust the triple to match (if known), otherwise stick with the
|
|
// module/host triple.
|
|
Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch);
|
|
if (Type != Triple::UnknownArch)
|
|
TheTriple.setArch(Type);
|
|
|
|
// Package up features to be passed to target/subtarget
|
|
std::string FeatureStr;
|
|
if ((Type == Triple::amdil || Type == Triple::amdil64) &&
|
|
targetMap[famID].chip_options) {
|
|
uint64_t y = targetMap[famID].chip_options;
|
|
for (uint64_t x = 0; y != 0; y >>= 1, ++x) {
|
|
if (!(y & 0x1) && (x >= 11 && x < 16)) {
|
|
continue;
|
|
}
|
|
|
|
if ((1 << x) == F_NO_ALIAS) {
|
|
FeatureStr += (!OptionsObj->oVariables->AssumeAlias ? '+' : '-');
|
|
} else if ((1 << x) == F_STACK_UAV) {
|
|
FeatureStr += (OptionsObj->oVariables->UseStackUAV ? '+' : '-');
|
|
} else if ((1 << x) == F_MACRO_CALL) {
|
|
FeatureStr += (OptionsObj->oVariables->UseMacroForCall ? '+' : '-');
|
|
} else if ((1 << x) == F_64BIT_PTR) {
|
|
FeatureStr += (binary->target.arch_id == aclAMDIL64) ? '+' : '-';
|
|
} else {
|
|
FeatureStr += ((y & 0x1) ? '+' : '-');
|
|
}
|
|
|
|
FeatureStr += GPUCodeGenFlagTable[x];
|
|
if (y != 0x1) {
|
|
FeatureStr += ',';
|
|
}
|
|
}
|
|
}
|
|
|
|
if (Type == Triple::amdil64) {
|
|
if (OptionsObj->oVariables->SmallGlobalObjects)
|
|
FeatureStr += ",+small-global-objects";
|
|
}
|
|
|
|
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1463
|
|
llvm::TargetOptions targetOptions;
|
|
targetOptions.NoFramePointerElim = false;
|
|
targetOptions.StackAlignmentOverride =
|
|
OptionsObj->oVariables->CPUStackAlignment;
|
|
// jgolds
|
|
//targetOptions.EnableEBB = (optimize && OptionsObj->oVariables->CGEBB);
|
|
//targetOptions.EnableBFO = OptionsObj->oVariables->CGBFO;
|
|
//targetOptions.NoExcessFPPrecision = !OptionsObj->oVariables->EnableFMA;
|
|
|
|
// Don't allow unsafe optimizations for CPU because the library
|
|
// contains code that is not safe. See bug 9567.
|
|
if (isGPU)
|
|
targetOptions.UnsafeFPMath = OptionsObj->oVariables->UnsafeMathOpt;
|
|
targetOptions.LessPreciseFPMADOption = OptionsObj->oVariables->MadEnable ||
|
|
OptionsObj->oVariables->EnableMAD;
|
|
targetOptions.NoInfsFPMath = OptionsObj->oVariables->FiniteMathOnly;
|
|
// Need to add a support for OptionsObj->oVariables->NoSignedZeros,
|
|
targetOptions.NoNaNsFPMath = OptionsObj->oVariables->FastRelaxedMath;
|
|
|
|
std::auto_ptr<TargetMachine>
|
|
target(TheTarget->createTargetMachine(TheTriple.getTriple(),
|
|
aclutGetCodegenName(binary->target), FeatureStr, targetOptions,
|
|
WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_),
|
|
CodeModel::Default, OLvl));
|
|
#else
|
|
std::auto_ptr<TargetMachine>
|
|
target(TheTarget->createTargetMachine(TheTriple.getTriple(),
|
|
aclutGetCodegenName(binary->target), FeatureStr,
|
|
WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_),
|
|
CodeModel::Default));
|
|
assert(target.get() && "Could not allocate target machine!");
|
|
#endif
|
|
|
|
// MCJIT(Jan)
|
|
if(!isGPU && OptionsObj->oVariables->UseJIT) {
|
|
TargetMachine* jittarget(TheTarget->createTargetMachine(TheTriple.getTriple(),
|
|
aclutGetCodegenName(binary->target), FeatureStr, targetOptions,
|
|
WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_),
|
|
CodeModel::Default, OLvl));
|
|
|
|
std::string ErrStr = jitCodeGen(Composite, jittarget, OLvl, output);
|
|
|
|
if (!ErrStr.empty()) {
|
|
LogError("MCJIT failed to generate code");
|
|
LogError(ErrStr.c_str());
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
TargetMachine &Target = *target;
|
|
|
|
// Figure out where we are going to send the output...
|
|
raw_string_ostream *RSOut = new raw_string_ostream(output);
|
|
formatted_raw_ostream *Out = new formatted_raw_ostream(*RSOut, formatted_raw_ostream::DELETE_STREAM);
|
|
if (Out == 0) {
|
|
LogError("llvmCodeGen couldn't create an output stream");
|
|
return 1;
|
|
}
|
|
|
|
// Build up all of the passes that we want to do to the module or function or
|
|
// Basic Block.
|
|
PassManager Passes;
|
|
|
|
// Add the target data from the target machine, if it exists, or the module.
|
|
if (const DataLayout *TD = Target.getDataLayout())
|
|
Passes.add(new DataLayout(*TD));
|
|
else
|
|
Passes.add(new DataLayout(&mod));
|
|
|
|
// Override default to generate verbose assembly, if the device is not the GPU.
|
|
// The GPU sets this in AMDILTargetMachine.cpp.
|
|
if (familyMap.target == (const TargetMapping*)&X86TargetMapping ||
|
|
#if WITH_VERSION_0_9
|
|
familyMap.target == (const TargetMapping*)&A32TargetMapping ||
|
|
familyMap.target == (const TargetMapping*)&A32TargetMapping ||
|
|
#elif WITH_VERSION_0_8
|
|
#else
|
|
#error "The current version implementation was not implemented here."
|
|
#endif
|
|
familyMap.target == (const TargetMapping*)&X64TargetMapping
|
|
) {
|
|
Target.setAsmVerbosityDefault(true);
|
|
}
|
|
|
|
#ifdef WITH_TARGET_HSAIL
|
|
if (isHSAILTarget(binary->target)) {
|
|
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_ObjectFile, true)) {
|
|
delete Out;
|
|
return 1;
|
|
}
|
|
} else
|
|
#endif
|
|
{
|
|
#ifndef NDEBUG
|
|
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1144
|
|
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, false))
|
|
#else
|
|
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, OLvl, false))
|
|
#endif
|
|
#else
|
|
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1144
|
|
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, true))
|
|
#else
|
|
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, OLvl, true))
|
|
#endif
|
|
#endif
|
|
{
|
|
delete Out;
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
Passes.run(mod);
|
|
llvm::PrintStatistics();
|
|
delete Out;
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
CLCodeGen::codegen(llvm::Module *input)
|
|
{
|
|
uint64_t time_cg = 0ULL;
|
|
if (Options()->oVariables->EnableBuildTiming) {
|
|
time_cg = amd::Os::timeNanos();
|
|
}
|
|
llvmbinary_ = input;
|
|
amdcl::CompilerStage *cs = reinterpret_cast<amdcl::CompilerStage*>(this);
|
|
if (!isHSAILTarget(cs->Elf()->target)) {
|
|
setWholeProgram(true);
|
|
}
|
|
|
|
int ret = llvmCodeGen(LLVMBinary(), Options(), Source(), Elf());
|
|
|
|
if (Options()->oVariables->EnableBuildTiming) {
|
|
time_cg = amd::Os::timeNanos() - time_cg;
|
|
std::stringstream tmp_ss;
|
|
tmp_ss << " LLVM CodeGen time: "
|
|
<< time_cg/1000ULL
|
|
<< "us\n";
|
|
appendLogToCL(CL(), tmp_ss.str());
|
|
}
|
|
if (!Source().empty() && Options()->isDumpFlagSet(amd::option::DUMP_CGIL)) {
|
|
std::string ilFileName = Options()->getDumpFileName(".il");
|
|
std::fstream f;
|
|
f.open(ilFileName.c_str(), (std::fstream::out | std::fstream::binary));
|
|
f.write(Source().data(), Source().length());
|
|
f.close();
|
|
}
|
|
|
|
return ret;
|
|
}
|