Files
rocm-systems/rocclr/compiler/lib/backends/common/codegen.cpp
T
foreman efa52f77ef P4 to Git Change 1264269 by smekhano@stas-nova-hsa on 2016/05/02 16:32:25
SWDEV-77584 - HSA HLC: refactoring of min/max processing and folding

	1. Fixed correctness bug: if a source contains code like (x > y) ? x : y, HLC was folding
	this and similar patterns to min and max instructions. The problem is with NaN handling.
	Such a pattern may return NaN if one of two arguments is a NaN. All our instructions return
	a number in this case, except for gcn instruction returning a qNaN if input is sNaN.
	For a qNaN a number is retuned in any way. Therefor such folding is only correct if NaN handling
	is disabled. Patterns are predicated to work with -cl-finite-math-only or -cl-fast-relaxed-math
	which includes the former option.

	NB: Performance regressions are expected in programs which do not use either of these options.

	2. Compiler lib did hot handle -cl-finite-math-only. Also added handling of -cl-no-signed-zeros,
	even though it does not affect code generation because there is no llvm counterpart for it.

	3. Patterns for NaN agnostic comparison codes are added. We are getting these in case if finite
	only math is requested.

	4. Removed patterns for __hsail_min_f* and __hsail_max_f*. Instead these intrinsics are lowered
	to fminnum and fmaxnum llvm operations with the same semantics. This allows to decrease the number
	of patterns and simplify handling.

	5. For f32 we were only producing gcn versions min and max with source patterns if gcn is enabled.
	Added similar lowering to standard min/max HSAIL operations if gcn is disabled.

	6. Added lowering of fmaxnum/fminnum to more efficient gcn operations if gcn is enabled.
	Neither OpenCL nor LLVM IR semantics are violated by this.

	7. Moved GCN media intrinsics definitions into the GCN directory.

	8. Added folding of gcn f32 instructions min(max), min(min), max(max) into corresponding gcn
	instructions med3, min3 and max3. This should have been helpful for color clamping.
	Performance testing showed these are slow, however. T-Rex test from compubench has slowed down
	by 50 times for no obvious reason. Therefor folding is disabled by default. The option -enable-gcn-mm3
	is added to enable the folding for testing purposes.

	Testing: smoke, precheckin, luxmark, compubench, BasemarkCL,
	conformance: commonfns, bruteforce -w, relationals, select
	Reviewed by Brian Sumner

Affected files ...

... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/codegen.cpp#68 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/opt_level.cpp#29 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/options.cpp#35 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/GCN/HSAILArithmetic.td#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/GCN/HSAILFusion.td#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/GCN/HSAILIntrinsics.td#4 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILArithmetic.td#45 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILFusion.td#28 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILISelDAGToDAG.cpp#68 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILISelLowering.cpp#113 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILInstrInfo.td#21 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILIntrinsics.td#70 edit
... //depot/stg/opencl/drivers/opencl/tests/hsa/src/llc/opt/minmax/minmaxf3pat.cl#1 add
... //depot/stg/opencl/drivers/opencl/tests/hsa/tlst/llc_opt.tlst#93 edit
2016-05-02 16:43:00 -04:00

683 řádky
22 KiB
C++

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#include "top.hpp"
#include "codegen.hpp"
#include "utils/libUtils.h"
#include "os/os.hpp"
#include "utils/target_mappings.h"
#ifdef _MSC_VER
/* for disabling warning in llvm/ADT/Statistic.h */
#pragma warning(disable:4146)
#endif
#include "llvm/ADT/Statistic.h"
#ifdef _MSC_VER
#pragma warning(default:4146)
#endif
#if defined(LEGACY_COMPLIB)
#include "llvm/DataLayout.h"
#include "llvm/Module.h"
#include "llvm/ExecutionEngine/ObjectImage.h"
#else
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
#include "llvm/Object/ObjectFile.h"
#endif
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/MCJIT.h"
#include <iostream>
#include <sstream>
#include <fstream>
#include <memory>
using namespace amdcl;
using namespace llvm;
//!--------------------------------------------------------------------------!//
// JIT Memory manager
//!--------------------------------------------------------------------------!//
OCLMCJITMemoryManager::~OCLMCJITMemoryManager() {
for (llvm::SmallVectorImpl<Allocation>::iterator
I = AllocatedCodeMem.begin(), E = AllocatedCodeMem.end();
I != E; ++I)
llvm::sys::Memory::releaseMappedMemory(I->first);
for (llvm::SmallVectorImpl<Allocation>::iterator
I = AllocatedDataMem.begin(), E = AllocatedDataMem.end();
I != E; ++I)
llvm::sys::Memory::releaseMappedMemory(I->first);
}
void
OCLMCJITMemoryManager::deallocateSection(uint8_t* BasePtr) {
for (llvm::SmallVectorImpl<Allocation>::iterator
I = AllocatedCodeMem.begin(), E = AllocatedCodeMem.end();
I != E; ++I)
if (I->first.base() == BasePtr) {
llvm::sys::Memory::releaseMappedMemory(I->first);
AllocatedCodeMem.erase(I);
return;
}
for (llvm::SmallVectorImpl<Allocation>::iterator
I = AllocatedDataMem.begin(), E = AllocatedDataMem.end();
I != E; ++I)
if (I->first.base() == BasePtr) {
llvm::sys::Memory::releaseMappedMemory(I->first);
AllocatedDataMem.erase(I);
return;
}
}
void OCLMCJITMemoryManager::reserveMemory(uint64_t Size) {
llvm::sys::MemoryBlock Block = allocateSection(Size);
AllocatedCodeMem.push_back(Allocation(Block, 64));
allocPtr = (uint8_t*)Block.base();
allocMaxPtr = allocPtr + Block.size();
}
uint8_t *OCLMCJITMemoryManager::
allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID
#if !defined(LEGACY_COMPLIB)
, llvm::StringRef SectionName
#endif
) {
// The recording memory manager is just a local copy of the remote target.
// The alignment requirement is just stored here for later use. Regular
// heap storage is sufficient here, but we're using mapped memory to work
// around a bug in MCJIT.
uint8_t* address = reservedAlloc(Size, Alignment);
if(address != NULL) {
return address;
} else {
llvm::sys::MemoryBlock Block = allocateSection(Size);
AllocatedCodeMem.push_back(Allocation(Block, Alignment));
return (uint8_t*)Block.base();
}
}
uint8_t *OCLMCJITMemoryManager::
allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID,
#if !defined(LEGACY_COMPLIB)
llvm::StringRef SectionName,
#endif
bool isReadOnly) {
// The recording memory manager is just a local copy of the remote target.
// The alignment requirement is just stored here for later use. Regular
// heap storage is sufficient here, but we're using mapped memory to work
// around a bug in MCJIT.
uint8_t* address = reservedAlloc(Size, Alignment);
if(address != NULL) {
return address;
} else {
llvm::sys::MemoryBlock Block = allocateSection(Size);
AllocatedDataMem.push_back(Allocation(Block, Alignment));
return (uint8_t*)Block.base();
}
}
uint8_t * OCLMCJITMemoryManager::reservedAlloc(uintptr_t Size, unsigned Alignment) {
if(allocPtr != NULL) {
uint8_t *allocPtrAligned =
(uint8_t*)(((uintptr_t)allocPtr +
((uintptr_t)Alignment-1)) & ~((uintptr_t)Alignment-1));
uint8_t *allocPtrNext = allocPtrAligned + Size;
if(allocPtrNext < allocMaxPtr) {
allocPtr = allocPtrNext;
return allocPtrAligned;
}
}
return NULL;
}
llvm::sys::MemoryBlock OCLMCJITMemoryManager::allocateSection(uintptr_t Size) {
#if defined(LEGACY_COMPLIB)
llvm::error_code ec;
#else
std::error_code ec;
#endif
llvm::sys::MemoryBlock MB =
llvm::sys::Memory::allocateMappedMemory(Size,
&Near,
llvm::sys::Memory::MF_READ |
llvm::sys::Memory::MF_WRITE |
llvm::sys::Memory::MF_EXEC,
ec);
assert(!ec && MB.base());
// FIXME: This is part of a work around to keep sections near one another
// when MCJIT performs relocations after code emission but before
// the generated code is moved to the remote target.
// Save this address as the basis for our next request
Near = MB;
return MB;
}
#if !defined(LEGACY_COMPLIB)
void OCLMCJITMemoryManager::reserveAllocationSpace(uintptr_t CodeSize,
uintptr_t DataSizeRO,
uintptr_t DataSizeRW) {
uint64_t GOTTableReserveSize = 4096;
uint64_t Size = (uint64_t)CodeSize + (uint64_t)DataSizeRO +
(uint64_t)DataSizeRW + GOTTableReserveSize;
if ((uint64_t)allocPtr + (uint64_t)Size > (uint64_t)allocMaxPtr)
reserveMemory(Size);
}
#endif // !LEGACY_COMPLIB
void OCLMCJITMemoryManager::setMemoryWritable() {
assert(!"Unexpected");
}
void OCLMCJITMemoryManager::setMemoryExecutable() {
assert(!"Unexpected");
}
void OCLMCJITMemoryManager::setPoisonMemory(bool poison) {
assert(!"Unexpected");
}
void OCLMCJITMemoryManager::AllocateGOT() {
assert(!"Unexpected");
}
uint8_t *OCLMCJITMemoryManager::getGOTBase() const {
assert(!"Unexpected");
return 0;
}
uint8_t *OCLMCJITMemoryManager::startFunctionBody(const llvm::Function *F,
uintptr_t &ActualSize) {
assert(!"Unexpected");
return 0;
}
uint8_t *OCLMCJITMemoryManager::allocateStub(const llvm::GlobalValue* F,
unsigned StubSize,
unsigned Alignment) {
assert(!"Unexpected");
return 0;
}
void OCLMCJITMemoryManager::endFunctionBody(const llvm::Function *F,
uint8_t *FunctionStart,
uint8_t *FunctionEnd) {
assert(!"Unexpected");
}
uint8_t *OCLMCJITMemoryManager::allocateSpace(intptr_t Size,
unsigned Alignment) {
assert(!"Unexpected");
return 0;
}
uint8_t *OCLMCJITMemoryManager::allocateGlobal(uintptr_t Size,
unsigned Alignment) {
assert(!"Unexpected");
return 0;
}
void OCLMCJITMemoryManager::deallocateFunctionBody(void *Body) {
assert(!"Unexpected");
}
uint8_t* OCLMCJITMemoryManager::startExceptionTable(const llvm::Function* F,
uintptr_t &ActualSize) {
assert(!"Unexpected");
return 0;
}
void OCLMCJITMemoryManager::endExceptionTable(const llvm::Function *F,
uint8_t *TableStart,
uint8_t *TableEnd,
uint8_t* FrameRegister) {
assert(!"Unexpected");
}
void OCLMCJITMemoryManager::deallocateExceptionTable(void *ET) {
assert(!"Unexpected");
}
static int jit_noop() {
return 0;
}
void *OCLMCJITMemoryManager::getPointerToNamedFunction(const std::string &Name,
bool AbortOnFailure) {
// We should not invoke parent's ctors/dtors from generated main()!
// On Mingw and Cygwin, the symbol __main is resolved to
// callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
// (and register wrong callee's dtors with atexit(3)).
// We expect ExecutionEngine::runStaticConstructorsDestructors()
// is called before ExecutionEngine::runFunctionAsMain() is called.
if (Name == "__main") return (void*)(intptr_t)&jit_noop;
return NULL;
}
//!--------------------------------------------------------------------------!//
// JIT Event Listener
//!--------------------------------------------------------------------------!//
class OclJITEventListener : public llvm::JITEventListener
{
private:
std::string* output_;
public:
OclJITEventListener(std::string &output) {
output_ = &output;
}
virtual void NotifyObjectEmitted
#if defined(LEGACY_COMPLIB)
(const llvm::ObjectImage &Obj)
#else
(const llvm::object::ObjectFile &Obj, const llvm::RuntimeDyld::LoadedObjectInfo &L)
#endif
override {
encodeObjectImage(Obj.getData(), *output_);
}
// Encoding and decoding are used to eliminate 0x00 ('\0') from the
// string so it is safe to use it as a null terminated c string.
// Translate:
// 0x00 -> 0xaa 0x55
// 0xaa -> 0xaa 0xaa
static void encodeObjectImage(std::string objectImage, std::string &encodedObjectImage) {
size_t length = objectImage.length();
for (size_t i = 0; i < length; ++i) {
unsigned char c = objectImage[i];
switch (c) {
case 0x00U:
encodedObjectImage.push_back(0xaaU);
encodedObjectImage.push_back(0x55U);
break;
case 0xaaU:
encodedObjectImage.push_back(0xaaU);
encodedObjectImage.push_back(0xaaU);
break;
default:
encodedObjectImage.push_back(c);
break;
}
}
}
// Translate:
// 0xaa 0x55 -> 0x00
// 0xaa 0xaa -> 0xaa
static void decodeObjectImage(std::string encodedObjectImage, std::string &decodedObjectImage) {
size_t length = encodedObjectImage.length();
for (size_t i = 0; i < length; ++i) {
unsigned char c = encodedObjectImage[i];
switch (c) {
case 0xaaU:
{
i = i + 1; // Increment to advance two characters
unsigned char cnext = encodedObjectImage[i];
if (cnext == 0xaaU) {
decodedObjectImage.push_back(0xaaU);
} else if (cnext == 0x55U) {
decodedObjectImage.push_back(0x00U);
} else {
assert(!"Bad encoding encountered");
}
}
break;
default:
decodedObjectImage.push_back(c);
break;
}
}
}
};
void decodeObjectImage(std::string encodedObjectImage, std::string &decodedObjectImage) {
OclJITEventListener::decodeObjectImage(encodedObjectImage, decodedObjectImage);
}
// Returns empty string if code generation was successful,
// otherwise the return string contains the error the MCJIT encountered.
std::string
jitCodeGen(llvm::Module* Composite,
llvm::TargetMachine* TargetMachine,
llvm::CodeGenOpt::Level OLvl,
std::string& output) {
std::string ErrStr;
OclJITEventListener Listener(output);
llvm::InitializeNativeTargetAsmParser();
llvm::InitializeNativeTargetAsmPrinter();
#if defined(LEGACY_COMPLIB)
OCLMCJITMemoryManager* MemMgr = new OCLMCJITMemoryManager();
llvm::EngineBuilder builder(Composite);
#else
std::unique_ptr<RTDyldMemoryManager> MemMgr(new OCLMCJITMemoryManager());
// FIXME: this llvm::Module* actually seems to be got from unique_ptr::get()
// somewhere, but acl functions use Module* instead of std::unique_ptr<llvm::Module>
// llvm::EngineBuilder does std::move on this pointer further so Module* can be
// deleted twice... llvm::EngineBuilder builder(Composite);
std::unique_ptr<llvm::Module> MPtr(Composite);
llvm::EngineBuilder builder(std::move(MPtr));
#endif
builder.setOptLevel(OLvl);
builder.setErrorStr(&ErrStr);
#if defined(LEGACY_COMPLIB)
builder.setJITMemoryManager(MemMgr);
builder.setUseMCJIT(true);
#else
builder.setMCJITMemoryManager(std::move(MemMgr));
#endif
// builder.setRelocationModel(llvm::Reloc::PIC_)
// builder.setCodeModel(llvm::CodeModel::Large)
#ifndef ANDROID
std::unique_ptr<llvm::ExecutionEngine>
TheExecutionEngine(builder.create(TargetMachine));
TheExecutionEngine->RegisterJITEventListener(&Listener);
TheExecutionEngine->finalizeObject();
TheExecutionEngine->removeModule(Composite);
#endif
return ErrStr;
}
int
llvmCodeGen(
Module* Composite,
amd::option::Options *OptionsObj,
std::string& output,
aclBinary* binary)
{
const FamilyMapping &familyMap = familySet[binary->target.arch_id];
const bool optimize = (OptionsObj ? (OptionsObj->oVariables->OptLevel > 0) : true);
const TargetMapping* targetMap = familyMap.target;
unsigned famID = binary->target.chip_id;
if (!targetMap || !targetMap[famID].supported) {
LogError("Device is not supported by code generator!");
return 1;
}
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1463
#else
// a dirty way to guarantee "push bp" inserted by CodeGen in prologue
llvm::NoFramePointerElim = !optimize;
#endif
// Load the module to be compiled...
Module &mod = *Composite;
// FIXME: The triple given in this map is wrong and isn't really
// useful. Only need the architecture.
const std::string TargetTriple = std::string(familyMap.triple);
Triple TheTriple(TargetTriple);
if (TheTriple.getTriple().empty()) {
TheTriple.setTriple(sys::getDefaultTargetTriple());
}
Triple::ArchType arch = TheTriple.getArch();
bool isGPU = (arch == Triple::amdil || arch == Triple::amdil64 ||
arch == Triple::hsail || arch == Triple::hsail64);
if (isGPU) {
TheTriple.setOS(Triple::UnknownOS);
} else { // CPUs
// FIXME: This should come from somewhere else.
#ifdef __linux__
TheTriple.setOS(Triple::Linux);
#else
#if defined(LEGACY_COMPLIB)
TheTriple.setOS(Triple::MinGW32);
#else
TheTriple.setOS(Triple::Win32);
#endif
#endif
}
TheTriple.setEnvironment(Triple::AMDOpenCL);
// FIXME: need to make AMDOpenCL be the same as ELF
if (OptionsObj->oVariables->UseJIT)
#if defined(LEGACY_COMPLIB)
TheTriple.setEnvironment(Triple::ELF);
#else
TheTriple.setObjectFormat(Triple::ELF);
#endif
mod.setTargetTriple(TheTriple.getTriple());
// Allocate target machine. First, check whether the user has explicitly
// specified an architecture to compile for. If so we have to look it up by
// name, because it might be a backend that has no mapping to a target triple.
const Target *TheTarget = 0;
assert(binary->target.arch_id != aclError && "Cannot have the error device!");
std::string MArch = familyMap.architecture;
#ifdef WITH_TARGET_HSAIL
if (MArch == "hsail" && OptionsObj->oVariables->GPU64BitIsa) {
MArch = std::string("hsail64");
}
#endif
for (TargetRegistry::iterator it = TargetRegistry::begin(),
ie = TargetRegistry::end(); it != ie; ++it) {
if (MArch == it->getName()) {
TheTarget = &*it;
break;
}
}
if (!TheTarget) {
errs() << ": ERROR: invalid target '" << MArch << "'.\n";
return 1;
}
CodeGenOpt::Level OLvl = CodeGenOpt::None;
switch (OptionsObj->oVariables->OptLevel) {
case 0: // -O0
OLvl = CodeGenOpt::None;
break;
case 1: // -O1
OLvl = CodeGenOpt::Less;
break;
default:
assert(!"Error with optimization level");
case 2: // -O2
case 5: // -O5(-Os)
OLvl = CodeGenOpt::Default;
break;
case 3: // -O3
case 4: // -O4
OLvl = CodeGenOpt::Aggressive;
break;
};
// Adjust the triple to match (if known), otherwise stick with the
// module/host triple.
Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch);
if (Type != Triple::UnknownArch)
TheTriple.setArch(Type);
// Package up features to be passed to target/subtarget
std::string FeatureStr;
if ((Type == Triple::amdil || Type == Triple::amdil64) &&
targetMap[famID].chip_options) {
uint64_t y = targetMap[famID].chip_options;
for (uint64_t x = 0; y != 0; y >>= 1, ++x) {
if (!(y & 0x1) && (x >= 11 && x < 16)) {
continue;
}
if ((1 << x) == F_NO_ALIAS) {
FeatureStr += (!OptionsObj->oVariables->AssumeAlias ? '+' : '-');
} else if ((1 << x) == F_STACK_UAV) {
FeatureStr += (OptionsObj->oVariables->UseStackUAV ? '+' : '-');
} else if ((1 << x) == F_MACRO_CALL) {
FeatureStr += (OptionsObj->oVariables->UseMacroForCall ? '+' : '-');
} else if ((1 << x) == F_64BIT_PTR) {
FeatureStr += (binary->target.arch_id == aclAMDIL64) ? '+' : '-';
} else {
FeatureStr += ((y & 0x1) ? '+' : '-');
}
FeatureStr += GPUCodeGenFlagTable[x];
if (y != 0x1) {
FeatureStr += ',';
}
}
}
if (Type == Triple::amdil64) {
if (OptionsObj->oVariables->SmallGlobalObjects)
FeatureStr += ",+small-global-objects";
}
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1463
llvm::TargetOptions targetOptions;
targetOptions.NoFramePointerElim = false;
targetOptions.StackAlignmentOverride =
OptionsObj->oVariables->CPUStackAlignment;
// jgolds
//targetOptions.EnableEBB = (optimize && OptionsObj->oVariables->CGEBB);
//targetOptions.EnableBFO = OptionsObj->oVariables->CGBFO;
//targetOptions.NoExcessFPPrecision = !OptionsObj->oVariables->EnableFMA;
// Don't allow unsafe optimizations for CPU because the library
// contains code that is not safe. See bug 9567.
if (isGPU)
targetOptions.UnsafeFPMath = OptionsObj->oVariables->UnsafeMathOpt;
targetOptions.LessPreciseFPMADOption = OptionsObj->oVariables->MadEnable ||
OptionsObj->oVariables->EnableMAD;
targetOptions.NoInfsFPMath = OptionsObj->oVariables->FiniteMathOnly;
// Need to add a support for OptionsObj->oVariables->NoSignedZeros,
targetOptions.NoNaNsFPMath = OptionsObj->oVariables->FiniteMathOnly;
std::auto_ptr<TargetMachine>
target(TheTarget->createTargetMachine(TheTriple.getTriple(),
aclutGetCodegenName(binary->target), FeatureStr, targetOptions,
WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_),
CodeModel::Default, OLvl));
#else
std::auto_ptr<TargetMachine>
target(TheTarget->createTargetMachine(TheTriple.getTriple(),
aclutGetCodegenName(binary->target), FeatureStr,
WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_),
CodeModel::Default));
assert(target.get() && "Could not allocate target machine!");
#endif
// MCJIT(Jan)
if(!isGPU && OptionsObj->oVariables->UseJIT) {
TargetMachine* jittarget(TheTarget->createTargetMachine(TheTriple.getTriple(),
aclutGetCodegenName(binary->target), FeatureStr, targetOptions,
WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_),
CodeModel::Default, OLvl));
std::string ErrStr = jitCodeGen(Composite, jittarget, OLvl, output);
if (!ErrStr.empty()) {
LogError("MCJIT failed to generate code");
LogError(ErrStr.c_str());
return 1;
}
return 0;
}
TargetMachine &Target = *target;
// Figure out where we are going to send the output...
raw_string_ostream *RSOut = new raw_string_ostream(output);
formatted_raw_ostream *Out = new formatted_raw_ostream(*RSOut, formatted_raw_ostream::DELETE_STREAM);
if (Out == 0) {
LogError("llvmCodeGen couldn't create an output stream");
return 1;
}
// Build up all of the passes that we want to do to the module or function or
// Basic Block.
PassManager Passes;
// Add the target data from the target machine, if it exists, or the module.
#if defined(LEGACY_COMPLIB)
if (const DataLayout *TD = Target.getDataLayout())
Passes.add(new DataLayout(*TD));
else
Passes.add(new DataLayout(&mod));
#else
Passes.add(new DataLayoutPass());
#endif
// Override default to generate verbose assembly, if the device is not the GPU.
// The GPU sets this in AMDILTargetMachine.cpp.
if (familyMap.target == (const TargetMapping*)&X86TargetMapping ||
familyMap.target == (const TargetMapping*)&X64TargetMapping
) {
Target.setAsmVerbosityDefault(true);
}
#ifdef WITH_TARGET_HSAIL
if (isHSAILTarget(binary->target)) {
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_ObjectFile, true)) {
delete Out;
return 1;
}
} else
#endif
{
#ifndef NDEBUG
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1144
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, false))
#else
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, OLvl, false))
#endif
#else
#if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1144
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, true))
#else
if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, OLvl, true))
#endif
#endif
{
delete Out;
return 1;
}
}
Passes.run(mod);
llvm::PrintStatistics();
delete Out;
return 0;
}
int
CLCodeGen::codegen(llvm::Module *input)
{
uint64_t time_cg = 0ULL;
if (Options()->oVariables->EnableBuildTiming) {
time_cg = amd::Os::timeNanos();
}
llvmbinary_ = input;
amdcl::CompilerStage *cs = reinterpret_cast<amdcl::CompilerStage*>(this);
if (!isHSAILTarget(cs->Elf()->target)) {
setWholeProgram(true);
}
setUniformWorkGroupSize(Options()->oVariables->UniformWorkGroupSize);
int ret = llvmCodeGen(LLVMBinary(), Options(), Source(), Elf());
if (Options()->oVariables->EnableBuildTiming) {
time_cg = amd::Os::timeNanos() - time_cg;
std::stringstream tmp_ss;
tmp_ss << " LLVM CodeGen time: "
<< time_cg/1000ULL
<< "us\n";
appendLogToCL(CL(), tmp_ss.str());
}
if (!Source().empty() && Options()->isDumpFlagSet(amd::option::DUMP_CGIL)) {
std::string ilFileName = Options()->getDumpFileName(".il");
std::fstream f;
f.open(ilFileName.c_str(), (std::fstream::out | std::fstream::binary));
f.write(Source().data(), Source().length());
f.close();
}
return ret;
}