// // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. // #include "top.hpp" #include "codegen.hpp" #include "utils/libUtils.h" #include "os/os.hpp" #include "jit/src/jit.hpp" #include "utils/target_mappings.h" #ifdef _MSC_VER /* for disabling warning in llvm/ADT/Statistic.h */ #pragma warning(disable:4146) #endif #include "llvm/ADT/Statistic.h" #ifdef _MSC_VER #pragma warning(default:4146) #endif #include "llvm/Support/FormattedStream.h" #include "llvm/Support/Host.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetSelect.h" #include "llvm/DataLayout.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/CommandLine.h" #include #include #include #include using namespace amdcl; using namespace llvm; static std::string aclGetCodegenName(const aclTargetInfo &tgtInfo) { assert(tgtInfo.arch_id <= aclLast && "Unknown device id!"); const FamilyMapping *family = familySet + tgtInfo.arch_id; if (!family) return ""; assert((tgtInfo.chip_id) < family->children_size && "Unknown family id!"); const TargetMapping *target = &family->target[tgtInfo.chip_id]; return (target) ? target->codegen_name : ""; } /*! Function that modifies the code gen level based on the * function size threshhold. */ static CodeGenOpt::Level AdjustCGOptLevel(Module& M, CodeGenOpt::Level OrigOLvl) { const unsigned int FuncSizeThreshold = 10000; if (OrigOLvl == CodeGenOpt::None) return OrigOLvl; for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { Function *F = (Function *)I; if (F->size() > FuncSizeThreshold) { return CodeGenOpt::None; } } return OrigOLvl; } int llvmCodeGen( Module* Composite, amd::option::Options *OptionsObj, std::string& output, aclBinary* binary) { const FamilyMapping &familyMap = familySet[binary->target.arch_id]; const bool optimize = (OptionsObj ? (OptionsObj->oVariables->OptLevel > 0) : true); const TargetMapping* targetMap = familyMap.target; unsigned famID = binary->target.chip_id; if (!targetMap || !targetMap[famID].supported) { LogError("Device is not supported by code generator!"); return 1; } #if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1463 #else // a dirty way to guarantee "push bp" inserted by CodeGen in prologue llvm::NoFramePointerElim = !optimize; #endif // Load the module to be compiled... Module &mod = *Composite; // FIXME: The triple given in this map is wrong and isn't really // useful. Only need the architecture. const std::string TargetTriple = std::string(familyMap.triple); Triple TheTriple(TargetTriple); if (TheTriple.getTriple().empty()) { TheTriple.setTriple(sys::getDefaultTargetTriple()); } Triple::ArchType arch = TheTriple.getArch(); bool isGPU = (arch == Triple::amdil || arch == Triple::amdil64 || arch == Triple::hsail || arch == Triple::hsail64); if (isGPU) { TheTriple.setOS(Triple::UnknownOS); } else { // CPUs // FIXME: This should come from somewhere else. #ifdef __linux__ TheTriple.setOS(Triple::Linux); #else TheTriple.setOS(Triple::MinGW32); #endif } TheTriple.setEnvironment(Triple::AMDOpenCL); // FIXME: need to make AMDOpenCL be the same as ELF if (OptionsObj->oVariables->UseJIT) TheTriple.setEnvironment(Triple::ELF); mod.setTargetTriple(TheTriple.getTriple()); // Allocate target machine. First, check whether the user has explicitly // specified an architecture to compile for. If so we have to look it up by // name, because it might be a backend that has no mapping to a target triple. const Target *TheTarget = 0; assert(binary->target.arch_id != aclError && "Cannot have the error device!"); std::string MArch = familyMap.architecture; #ifdef WITH_TARGET_HSAIL if (MArch == "hsail" && OptionsObj->oVariables->GPU64BitIsa) { MArch = std::string("hsail-64"); } #endif for (TargetRegistry::iterator it = TargetRegistry::begin(), ie = TargetRegistry::end(); it != ie; ++it) { if (MArch == it->getName()) { TheTarget = &*it; break; } } if (!TheTarget) { errs() << ": ERROR: invalid target '" << MArch << "'.\n"; return 1; } CodeGenOpt::Level OLvl = CodeGenOpt::None; switch (OptionsObj->oVariables->OptLevel) { case 0: // -O0 OLvl = CodeGenOpt::None; break; case 1: // -O1 OLvl = CodeGenOpt::Less; break; default: assert(!"Error with optimization level"); case 2: // -O2 case 5: // -O5(-Os) OLvl = CodeGenOpt::Default; break; case 3: // -O3 case 4: // -O4 OLvl = CodeGenOpt::Aggressive; break; }; // If there is a very big function, lower the optimization level. OLvl = AdjustCGOptLevel(mod, OLvl); // Adjust the triple to match (if known), otherwise stick with the // module/host triple. Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch); if (Type != Triple::UnknownArch) TheTriple.setArch(Type); // Package up features to be passed to target/subtarget std::string FeatureStr; if ((Type == Triple::amdil || Type == Triple::amdil64) && targetMap[famID].chip_options) { uint64_t y = targetMap[famID].chip_options; for (uint64_t x = 0; y != 0; y >>= 1, ++x) { if (!(y & 0x1) && (x >= 11 && x < 16)) { continue; } if ((1 << x) == F_NO_ALIAS) { FeatureStr += (!OptionsObj->oVariables->AssumeAlias ? '+' : '-'); } else if ((1 << x) == F_STACK_UAV) { FeatureStr += (OptionsObj->oVariables->UseStackUAV ? '+' : '-'); } else if ((1 << x) == F_MACRO_CALL) { FeatureStr += (OptionsObj->oVariables->UseMacroForCall ? '+' : '-'); } else if ((1 << x) == F_64BIT_PTR) { FeatureStr += (binary->target.arch_id == aclAMDIL64) ? '+' : '-'; } else { FeatureStr += ((y & 0x1) ? '+' : '-'); } FeatureStr += GPUCodeGenFlagTable[x]; if (y != 0x1) { FeatureStr += ','; } } } if (Type == Triple::amdil64) { if (OptionsObj->oVariables->SmallGlobalObjects) FeatureStr += ",+small-global-objects"; } #if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1463 llvm::TargetOptions targetOptions; targetOptions.NoFramePointerElim = false; targetOptions.StackAlignmentOverride = OptionsObj->oVariables->CPUStackAlignment; // jgolds //targetOptions.EnableEBB = (optimize && OptionsObj->oVariables->CGEBB); //targetOptions.EnableBFO = OptionsObj->oVariables->CGBFO; //targetOptions.NoExcessFPPrecision = !OptionsObj->oVariables->EnableFMA; // Don't allow unsafe optimizations for CPU because the library // contains code that is not safe. See bug 9567. if (isGPU) targetOptions.UnsafeFPMath = OptionsObj->oVariables->UnsafeMathOpt; targetOptions.LessPreciseFPMADOption = OptionsObj->oVariables->MadEnable || OptionsObj->oVariables->EnableMAD; targetOptions.NoInfsFPMath = OptionsObj->oVariables->FiniteMathOnly; // Need to add a support for OptionsObj->oVariables->NoSignedZeros, targetOptions.NoNaNsFPMath = OptionsObj->oVariables->FastRelaxedMath; std::auto_ptr target(TheTarget->createTargetMachine(TheTriple.getTriple(), aclGetCodegenName(binary->target), FeatureStr, targetOptions, WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_), CodeModel::Default, OLvl)); #else std::auto_ptr target(TheTarget->createTargetMachine(TheTriple.getTriple(), aclGetCodegenName(binary->target), FeatureStr, WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_), CodeModel::Default)); assert(target.get() && "Could not allocate target machine!"); #endif // MCJIT(Jan) if(!isGPU && OptionsObj->oVariables->UseJIT) { TargetMachine* jittarget(TheTarget->createTargetMachine(TheTriple.getTriple(), aclGetCodegenName(binary->target), FeatureStr, targetOptions, WINDOWS_SWITCH(Reloc::DynamicNoPIC, Reloc::PIC_), CodeModel::Default, OLvl)); std::string ErrStr = jitCodeGen(Composite, jittarget, OLvl, output); if (!ErrStr.empty()) { LogError("MCJIT failed to generate code"); LogError(ErrStr.c_str()); return 1; } return 0; } TargetMachine &Target = *target; // Figure out where we are going to send the output... raw_string_ostream *RSOut = new raw_string_ostream(output); formatted_raw_ostream *Out = new formatted_raw_ostream(*RSOut, formatted_raw_ostream::DELETE_STREAM); if (Out == 0) { LogError("llvmCodeGen couldn't create an output stream"); return 1; } // Build up all of the passes that we want to do to the module or function or // Basic Block. PassManager Passes; // Add the target data from the target machine, if it exists, or the module. if (const DataLayout *TD = Target.getDataLayout()) Passes.add(new DataLayout(*TD)); else Passes.add(new DataLayout(&mod)); // Override default to generate verbose assembly, if the device is not the GPU. // The GPU sets this in AMDILTargetMachine.cpp. if (familyMap.target == (const TargetMapping*)&X86TargetMapping || #if WITH_VERSION_0_9 familyMap.target == (const TargetMapping*)&A32TargetMapping || familyMap.target == (const TargetMapping*)&A32TargetMapping || #elif WITH_VERSION_0_8 #else #error "The current version implementation was not implemented here." #endif familyMap.target == (const TargetMapping*)&X64TargetMapping ) { Target.setAsmVerbosityDefault(true); } #ifdef WITH_TARGET_HSAIL if (isHSAILTarget(binary->target)) { if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_ObjectFile, true)) { delete Out; return 1; } } else #endif { #ifndef NDEBUG #if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1144 if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, false)) #else if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, OLvl, false)) #endif #else #if 1 || LLVM_TRUNK_INTEGRATION_CL >= 1144 if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, true)) #else if (Target.addPassesToEmitFile(Passes, *Out, TargetMachine::CGFT_AssemblyFile, OLvl, true)) #endif #endif { delete Out; return 1; } } Passes.run(mod); llvm::PrintStatistics(); delete Out; return 0; } int CLCodeGen::codegen(llvm::Module *input) { uint64_t time_cg = 0ULL; if (Options()->oVariables->EnableBuildTiming) { time_cg = amd::Os::timeNanos(); } llvmbinary_ = input; amdcl::CompilerStage *cs = reinterpret_cast(this); if (!isHSAILTarget(cs->Elf()->target)) { setWholeProgram(true); } int ret = llvmCodeGen(LLVMBinary(), Options(), Source(), Elf()); if (Options()->oVariables->EnableBuildTiming) { time_cg = amd::Os::timeNanos() - time_cg; std::stringstream tmp_ss; tmp_ss << " LLVM CodeGen time: " << time_cg/1000ULL << "us\n"; appendLogToCL(CL(), tmp_ss.str()); } if (!Source().empty() && Options()->isDumpFlagSet(amd::option::DUMP_CGIL)) { std::string ilFileName = Options()->getDumpFileName(".il"); std::fstream f; f.open(ilFileName.c_str(), (std::fstream::out | std::fstream::binary)); f.write(Source().data(), Source().length()); f.close(); } return ret; }