// // Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. // #include "os/os.hpp" #include "utils/flags.hpp" #include "include/aclTypes.h" #include "utils/amdilUtils.hpp" #include "utils/bif_section_labels.hpp" #include "device/pal/palprogram.hpp" #include "device/pal/palblit.hpp" #include "macrodata.h" #include "MDParser/AMDILMDInterface.h" #include #include #include #include #include #include "utils/options.hpp" #include "hsa.h" #include "hsa_ext_image.h" #include "amd_hsa_loader.hpp" #if defined(WITH_LIGHTNING_COMPILER) #include "libamdhsacode/amdgpu_metadata.hpp" #include "driver/AmdCompiler.h" #include "libraries.amdgcn.inc" #include "gelf.h" #endif // !defined(WITH_LIGHTNING_COMPILER) namespace pal { Segment::Segment() : gpuAccess_(nullptr) , cpuAccess_(nullptr) {} Segment::~Segment() { delete gpuAccess_; if (cpuAccess_ != nullptr) { cpuAccess_->unmap(nullptr); delete cpuAccess_; } } bool Segment::alloc( HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t size, size_t align, bool zero) { align = amd::alignUp(align, sizeof(uint32_t)); gpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align)); if ((gpuAccess_ == nullptr) || !gpuAccess_->create(pal::Resource::Local)) { delete gpuAccess_; gpuAccess_ = nullptr; return false; } if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) { cpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align)); if ((cpuAccess_ == nullptr) || !cpuAccess_->create(pal::Resource::Remote)) { delete cpuAccess_; cpuAccess_ = nullptr; return false; } void* ptr = cpuAccess_->map(nullptr, 0); if (zero) { memset(ptr, 0, size); } } if (zero && !prog.isInternal()) { char pattern = 0; prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size)); } switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: case AMDGPU_HSA_SEGMENT_READONLY_AGENT: prog.addGlobalStore(gpuAccess_); prog.setGlobalVariableTotalSize(prog.globalVariableTotalSize() + size); break; case AMDGPU_HSA_SEGMENT_CODE_AGENT: prog.setCodeObjects(gpuAccess_, cpuAccess_->data()); break; default: break; } return true; } void Segment::copy(size_t offset, const void* src, size_t size) { if (cpuAccess_ != nullptr) { amd::Os::fastMemcpy(cpuAddress(offset), src, size); } else { VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); Memory& xferBuf = gpuAccess_->dev().xferWrite().acquire(); size_t tmpSize = std::min(static_cast(xferBuf.vmSize()), size); size_t srcOffs = 0; while (size != 0) { xferBuf.hostWrite(&gpu, reinterpret_cast(src) + srcOffs, 0, tmpSize); bool result = xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true); size -= tmpSize; srcOffs += tmpSize; tmpSize = std::min(static_cast(xferBuf.vmSize()), size); } gpu.releaseMemObjects(); gpu.waitAllEngines(); } } bool Segment::freeze(bool destroySysmem) { VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); bool result = true; if (cpuAccess_ != nullptr) { result = cpuAccess_->partialMemCopyTo(gpu, 0, 0, gpuAccess_->vmSize(), *gpuAccess_, false, true); gpu.releaseMemObjects(); gpu.waitAllEngines(); } assert(!destroySysmem || (cpuAccess_ == nullptr)); return result; } HSAILProgram::HSAILProgram(Device& device) : Program(device) , llvmBinary_() , binaryElf_(nullptr) , rawBinary_(nullptr) , kernels_(nullptr) , codeSegGpu_(nullptr) , codeSegCpu_(nullptr) , maxScratchRegs_(0) , flags_(0) , executable_(nullptr) , loaderContext_(this) { memset(&binOpts_, 0, sizeof(binOpts_)); binOpts_.struct_size = sizeof(binOpts_); binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); binOpts_.bitness = ELFDATA2LSB; binOpts_.alloc = &::malloc; binOpts_.dealloc = &::free; loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } HSAILProgram::HSAILProgram(NullDevice& device) : Program(device) , llvmBinary_() , binaryElf_(nullptr) , rawBinary_(nullptr) , kernels_(nullptr) , codeSegGpu_(nullptr) , codeSegCpu_(nullptr) , maxScratchRegs_(0) , flags_(0) , executable_(nullptr) , loaderContext_(this) { memset(&binOpts_, 0, sizeof(binOpts_)); binOpts_.struct_size = sizeof(binOpts_); binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); binOpts_.bitness = ELFDATA2LSB; binOpts_.alloc = &::malloc; binOpts_.dealloc = &::free; loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } HSAILProgram::~HSAILProgram() { // Destroy internal static samplers for (auto& it : staticSamplers_) { delete it; } #if !defined(WITH_LIGHTNING_COMPILER) if (rawBinary_ != nullptr) { aclFreeMem(binaryElf_, rawBinary_); } acl_error error; // Free the elf binary if (binaryElf_ != nullptr) { error = aclBinaryFini(binaryElf_); if (error != ACL_SUCCESS) { LogWarning( "Error while destroying the acl binary \n" ); } } #endif // !defined(WITH_LIGHTNING_COMPILER) releaseClBinary(); if (executable_ != nullptr) { loader_->DestroyExecutable(executable_); } delete kernels_; amd::hsa::loader::Loader::Destroy(loader_); } bool HSAILProgram::initBuild(amd::option::Options *options) { if (!device::Program::initBuild(options)) { return false; } const char* devName = dev().hwInfo()->machineTarget_; options->setPerBuildInfo( (devName && (devName[0] != '\0')) ? devName : "gpu", clBinary()->getEncryptCode(), true); // Elf Binary setup std::string outFileName; // true means fsail required clBinary()->init(options, true); if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { outFileName = options->getDumpFileName(".bin"); } if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), (outFileName.size() > 0) ? outFileName.c_str() : nullptr)) { LogError("Setup elf out for gpu failed"); return false; } return true; } bool HSAILProgram::finiBuild(bool isBuildGood) { clBinary()->resetElfOut(); clBinary()->resetElfIn(); if (!isBuildGood) { // Prevent the encrypted binary form leaking out clBinary()->setBinary(nullptr, 0); } return device::Program::finiBuild(isBuildGood); } bool HSAILProgram::linkImpl( const std::vector &inputPrograms, amd::option::Options *options, bool createLibrary) { #if defined(WITH_LIGHTNING_COMPILER) assert(!"Should not reach here"); return false; #else // !defined(WITH_LIGHTNING_COMPILER) std::vector::const_iterator it = inputPrograms.begin(); std::vector::const_iterator itEnd = inputPrograms.end(); acl_error errorCode; // For each program we need to extract the LLVMIR and create // aclBinary for each std::vector binaries_to_link; for (size_t i = 0; it != itEnd; ++it, ++i) { HSAILProgram *program = (HSAILProgram *)*it; // Check if the program was created with clCreateProgramWIthBinary binary_t binary = program->binary(); if ((binary.first != nullptr) && (binary.second > 0)) { // Binary already exists -- we can also check if there is no // opencl source code // Need to check if LLVMIR exists in the binary // If LLVMIR does not exist then is it valid // We need to pull out all the compiled kernels // We cannot do this at present because we need at least // Hsail text to pull the kernels oout void *mem = const_cast(binary.first); binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); if (errorCode != ACL_SUCCESS) { LogWarning("Error while linking : Could not read from raw binary"); return false; } } // At this stage each HSAILProgram contains a valid binary_elf // Check if LLVMIR is in the binary // @TODO - Memory leak , cannot free this buffer // need to fix this.. File EPR on compiler library size_t llvmirSize = 0; const void *llvmirText = aclExtractSection(dev().compiler(), binaryElf_, &llvmirSize, aclLLVMIR, &errorCode); if (errorCode != ACL_SUCCESS) { bool spirv = false; size_t boolSize = sizeof(bool); errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize); if (errorCode != ACL_SUCCESS) { spirv = false; } if (spirv) { errorCode = aclCompile(dev().compiler(), binaryElf_, options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY, ACL_TYPE_LLVMIR_BINARY, nullptr); buildLog_ += aclGetCompilerLog(dev().compiler()); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error while linking: Could not load SPIR-V" ; return false; } } else { buildLog_ +="Error while linking : \ Invalid binary (Missing LLVMIR section)" ; return false; } } // Create a new aclBinary for each LLVMIR and save it in a list aclBIFVersion ver = aclBinaryVersion(binaryElf_); aclBinary *bin = aclCreateFromBinary(binaryElf_, ver); binaries_to_link.push_back(bin); } errorCode = aclLink(dev().compiler(), binaries_to_link[0], binaries_to_link.size() - 1, binaries_to_link.size() > 1 ? &binaries_to_link[1] : NULL, ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); if (errorCode != ACL_SUCCESS) { buildLog_ += aclGetCompilerLog(dev().compiler()); buildLog_ +="Error while linking : aclLink failed" ; return false; } // Store the newly linked aclBinary for this program. binaryElf_ = binaries_to_link[0]; // Free all the other aclBinaries for (size_t i = 1; i < binaries_to_link.size(); i++) { aclBinaryFini(binaries_to_link[i]); } if (createLibrary) { saveBinaryAndSetType(TYPE_LIBRARY); buildLog_ += aclGetCompilerLog(dev().compiler()); return true; } // Now call linkImpl with the new options return linkImpl(options); #endif // !defined(WITH_LIGHTNING_COMPILER) } aclType HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck) { #if defined(WITH_LIGHTNING_COMPILER) assert(!"Should not reach here"); return ACL_TYPE_DEFAULT; #else // !defined(WITH_LIGHTNING_COMPILER) acl_error errorCode; size_t secSize = 0; completeStages.clear(); aclType from = ACL_TYPE_DEFAULT; needOptionsCheck = true; size_t boolSize = sizeof(bool); // Checking llvmir in .llvmir section bool containsSpirv = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIRV, nullptr, &containsSpirv, &boolSize); if (errorCode != ACL_SUCCESS) { containsSpirv = false; } if (containsSpirv) { completeStages.push_back(from); from = ACL_TYPE_SPIRV_BINARY; } bool containsSpirText = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIR, nullptr, &containsSpirText, &boolSize); if (errorCode != ACL_SUCCESS) { containsSpirText = false; } if (containsSpirText) { completeStages.push_back(from); from = ACL_TYPE_SPIR_BINARY; } bool containsLlvmirText = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, nullptr, &containsLlvmirText, &boolSize); if (errorCode != ACL_SUCCESS) { containsLlvmirText = false; } // Checking compile & link options in .comment section bool containsOpts = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, nullptr, &containsOpts, &boolSize); if (errorCode != ACL_SUCCESS) { containsOpts = false; } if (containsLlvmirText && containsOpts) { completeStages.push_back(from); from = ACL_TYPE_LLVMIR_BINARY; } // Checking HSAIL in .cg section bool containsHsailText = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_HSAIL, nullptr, &containsHsailText, &boolSize); if (errorCode != ACL_SUCCESS) { containsHsailText = false; } // Checking BRIG sections bool containsBrig = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, &containsBrig, &boolSize); if (errorCode != ACL_SUCCESS) { containsBrig = false; } if (containsBrig) { completeStages.push_back(from); from = ACL_TYPE_HSAIL_BINARY; } else if (containsHsailText) { completeStages.push_back(from); from = ACL_TYPE_HSAIL_TEXT; } // Checking Loader Map symbol from CG section bool containsLoaderMap = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, nullptr, &containsLoaderMap, &boolSize); if (errorCode != ACL_SUCCESS) { containsLoaderMap = false; } if (containsLoaderMap) { completeStages.push_back(from); from = ACL_TYPE_CG; } // Checking ISA in .text section bool containsShaderIsa = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, &containsShaderIsa, &boolSize); if (errorCode != ACL_SUCCESS) { containsShaderIsa = false; } if (containsShaderIsa) { completeStages.push_back(from); from = ACL_TYPE_ISA; } std::string sCurOptions = compileOptions_ + linkOptions_; amd::option::Options curOptions; if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { buildLog_ += curOptions.optionsLog(); LogError("Parsing compile options failed."); return ACL_TYPE_DEFAULT; } switch (from) { // compile from HSAIL text, no matter prev. stages and options case ACL_TYPE_HSAIL_TEXT: needOptionsCheck = false; break; case ACL_TYPE_HSAIL_BINARY: // do not check options, if LLVMIR is absent or might be absent or options are absent if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { needOptionsCheck = false; } break; case ACL_TYPE_CG: case ACL_TYPE_ISA: // do not check options, if LLVMIR is absent or might be absent or options are absent if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { needOptionsCheck = false; } // do not check options, if BRIG is absent or might be absent or LoaderMap is absent if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) { needOptionsCheck = false; } break; // recompilation might be needed case ACL_TYPE_LLVMIR_BINARY: case ACL_TYPE_DEFAULT: default: break; } return from; #endif // !defined(WITH_LIGHTNING_COMPILER) } aclType HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) assert(!"Should not reach here"); return ACL_TYPE_DEFAULT; #else // !defined(WITH_LIGHTNING_COMPILER) aclType continueCompileFrom = ACL_TYPE_DEFAULT; binary_t binary = this->binary(); // If the binary already exists if ((binary.first != nullptr) && (binary.second > 0)) { void *mem = const_cast(binary.first); acl_error errorCode; binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Reading the binary from memory failed.\n"; return continueCompileFrom; } // Calculate the next stage to compile from, based on sections in binaryElf_; // No any validity checks here std::vector completeStages; bool needOptionsCheck = true; continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); // Saving binary in the interface class, // which also load compile & link options from binary setBinary(static_cast(mem), binary.second); if (!options || !needOptionsCheck) { return continueCompileFrom; } bool recompile = false; switch (continueCompileFrom) { case ACL_TYPE_HSAIL_BINARY: case ACL_TYPE_CG: case ACL_TYPE_ISA: { // Compare options loaded from binary with current ones, recompile if differ; // If compile options are absent in binary, do not compare and recompile if (compileOptions_.empty()) break; const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); assert(symbol && "symbol not found"); std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); size_t symSize = 0; const void *opts = aclExtractSymbol(dev().compiler(), binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode); if (errorCode != ACL_SUCCESS) { recompile = true; break; } std::string sBinOptions = std::string((char*)opts, symSize); std::string sCurOptions = compileOptions_ + linkOptions_; amd::option::Options curOptions, binOptions; if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { buildLog_ += binOptions.optionsLog(); LogError("Parsing compile options from binary failed."); return ACL_TYPE_DEFAULT; } if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { buildLog_ += curOptions.optionsLog(); LogError("Parsing compile options failed."); return ACL_TYPE_DEFAULT; } if (!curOptions.equals(binOptions)) { recompile = true; } break; } default: break; } if (recompile) { while (!completeStages.empty()) { continueCompileFrom = completeStages.back(); if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY || continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || continueCompileFrom == ACL_TYPE_SPIR_BINARY || continueCompileFrom == ACL_TYPE_DEFAULT) { break; } completeStages.pop_back(); } } } return continueCompileFrom; #endif // !defined(WITH_LIGHTNING_COMPILER) } inline static std::vector splitSpaceSeparatedString(char *str) { std::string s(str); std::stringstream ss(s); std::istream_iterator beg(ss), end; std::vector vec(beg, end); return vec; } bool HSAILProgram::linkImpl(amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) assert(!"Should not reach here"); return false; #else // !defined(WITH_LIGHTNING_COMPILER) acl_error errorCode; aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; bool finalize = true; bool hsaLoad = true; internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; // If !binaryElf_ then program must have been created using clCreateProgramWithBinary if (!binaryElf_) { continueCompileFrom = getNextCompilationStageFromBinary(options); } switch (continueCompileFrom) { case ACL_TYPE_SPIRV_BINARY: case ACL_TYPE_SPIR_BINARY: // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases: // 1. if the program is not created with binary; // 2. if the program is created with binary and contains only .llvmir & .comment // 3. if the program is created with binary, contains .llvmir, .comment, brig sections, // but the binary's compile & link options differ from current ones (recompilation); case ACL_TYPE_LLVMIR_BINARY: // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases: // 1. if the program is created with binary and contains only brig sections case ACL_TYPE_HSAIL_BINARY: // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases: // 1. if the program is created with binary and contains only hsail text case ACL_TYPE_HSAIL_TEXT: { std::string curOptions = options->origOptionStr + hsailOptions(); errorCode = aclCompile(dev().compiler(), binaryElf_, curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, nullptr); buildLog_ += aclGetCompilerLog(dev().compiler()); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: BRIG code generation failed.\n"; return false; } break; } case ACL_TYPE_CG: break; case ACL_TYPE_ISA: finalize = false; break; default: buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n"; return false; } if (finalize) { std::string fin_options(options->origOptionStr + hsailOptions()); // Append an option so that we can selectively enable a SCOption on CZ // whenever IOMMUv2 is enabled. if (dev().settings().svmFineGrainSystem_) { fin_options.append(" -sc-xnack-iommu"); } errorCode = aclCompile(dev().compiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr); buildLog_ += aclGetCompilerLog(dev().compiler()); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: BRIG finalization to ISA failed.\n"; return false; } } // ACL_TYPE_CG stage is not performed for offline compilation hsa_agent_t agent; agent.handle = 1; if (hsaLoad) { executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); if (executable_ == nullptr) { buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; return false; } size_t size = 0; hsa_code_object_t code_object; code_object.handle = reinterpret_cast(aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode)); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n"; return false; } hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); if (status != HSA_STATUS_SUCCESS) { buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; return false; } status = executable_->Freeze(nullptr); if (status != HSA_STATUS_SUCCESS) { buildLog_ += "Error: AMD HSA Code Object freeze failed.\n"; return false; } } size_t kernelNamesSize = 0; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; return false; } if (kernelNamesSize > 0) { char* kernelNames = new char[kernelNamesSize]; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; delete kernelNames; return false; } std::vector vKernels = splitSpaceSeparatedString(kernelNames); delete kernelNames; std::vector::iterator it = vKernels.begin(); bool dynamicParallelism = false; for (it; it != vKernels.end(); ++it) { std::string kernelName(*it); std::string openclKernelName = device::Kernel::openclMangledName(kernelName); HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions()); kernels()[kernelName] = aKernel; amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0); if (!sym) { buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName + "' from AMD HSA Code Object failed. Kernel initialization failed.\n"; return false; } if (!aKernel->init(sym, false)) { buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n"; return false; } buildLog_ += aKernel->buildLog(); aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); dynamicParallelism |= aKernel->dynamicParallelism(); // Find max scratch regs used in the program. It's used for scratch buffer preallocation // with dynamic parallelism, since runtime doesn't know which child kernel will be called maxScratchRegs_ = std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); } // Allocate kernel table for device enqueuing if (!isNull() && dynamicParallelism && !allocKernelTable()) { return false; } } // Save the binary in the interface class saveBinaryAndSetType(TYPE_EXECUTABLE); buildLog_ += aclGetCompilerLog(dev().compiler()); return true; #endif // !defined(WITH_LIGHTNING_COMPILER) } bool HSAILProgram::createBinary(amd::option::Options *options) { return true; } bool HSAILProgram::initClBinary() { if (clBinary_ == nullptr) { clBinary_ = new ClBinaryHsa(static_cast(device())); if (clBinary_ == nullptr) { return false; } } return true; } void HSAILProgram::releaseClBinary() { if (clBinary_ != nullptr) { delete clBinary_; clBinary_ = nullptr; } } std::string HSAILProgram::hsailOptions() { std::string hsailOptions; // Set options for the standard device specific options // All our devices support these options now if (dev().settings().reportFMAF_) { hsailOptions.append(" -DFP_FAST_FMAF=1"); } if (dev().settings().reportFMA_) { hsailOptions.append(" -DFP_FAST_FMA=1"); } #if !defined(WITH_LIGHTNING_COMPILER) if (!dev().settings().singleFpDenorm_) { hsailOptions.append(" -cl-denorms-are-zero"); } #endif // !defined(WITH_LIGHTNING_COMPILER) // Check if the host is 64 bit or 32 bit LP64_ONLY(hsailOptions.append(" -m64")); // Append each extension supported by the device std::string token; std::istringstream iss(""); iss.str(device().info().extensions_); while (getline(iss, token, ' ')) { if (!token.empty()) { hsailOptions.append(" -D"); hsailOptions.append(token); hsailOptions.append("=1"); } } return hsailOptions; } bool HSAILProgram::allocKernelTable() { uint size = kernels().size() * sizeof(size_t); kernels_ = new pal::Memory(dev(), size); // Initialize kernel table if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) { delete kernels_; return false; } else { size_t* table = reinterpret_cast( kernels_->map(nullptr, pal::Resource::WriteOnly)); for (auto& it : kernels()) { HSAILKernel* kernel = static_cast(it.second); table[kernel->index()] = static_cast(kernel->gpuAqlCode()); } kernels_->unmap(nullptr); } return true; } void HSAILProgram::fillResListWithKernels( std::vector& memList) const { memList.push_back(&codeSegGpu()); } const aclTargetInfo & HSAILProgram::info(const char * str) { #if defined(WITH_LIGHTNING_COMPILER) assert(!"Should not reach here"); #else // !defined(WITH_LIGHTNING_COMPILER) acl_error err; std::string arch = "hsail"; if (dev().settings().use64BitPtr_) { arch = "hsail64"; } info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ? dev().hwInfo()->targetName_ : str ), &err); if (err != ACL_SUCCESS) { LogWarning("aclGetTargetInfo failed"); } #endif // !defined(WITH_LIGHTNING_COMPILER) return info_; } bool HSAILProgram::saveBinaryAndSetType(type_t type) { #if defined(WITH_LIGHTNING_COMPILER) assert(!"Should not reach here"); #else // !defined(WITH_LIGHTNING_COMPILER) //Write binary to memory if (rawBinary_ != nullptr) { //Free memory containing rawBinary aclFreeMem(binaryElf_, rawBinary_); rawBinary_ = nullptr; } size_t size = 0; if (aclWriteToMem(binaryElf_, &rawBinary_, &size) != ACL_SUCCESS) { buildLog_ += "Failed to write binary to memory \n"; return false; } setBinary(static_cast(rawBinary_), size); //Set the type of binary setType(type); #endif // !defined(WITH_LIGHTNING_COMPILER) return true; } hsa_isa_t PALHSALoaderContext::IsaFromName(const char *name) { hsa_isa_t isa = {0}; if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; } if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; } if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; } if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; } if (!strcmp(Gfx804, name)) { isa.handle = gfx804; return isa; } if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; } if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; } if (!strcmp(Gfx901, name)) { isa.handle = gfx901; return isa; } return isa; } bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { switch (program_->dev().hwInfo()->gfxipVersion_) { default: LogError("Unsupported gfxip version"); return false; case gfx700: case gfx701: case gfx702: // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device. return isa.handle == gfx700 || isa.handle == gfx701; case gfx800: return isa.handle == gfx800; case gfx801: return isa.handle == gfx801; case gfx804: // gfx800 ISA has only sgrps limited and can be loaded. // gfx801 ISA has XNACK limitations and can be loaded. return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804; case gfx810: return isa.handle == gfx810; case gfx900: case gfx901: return isa.handle == gfx900 || isa.handle == gfx901; } } void* PALHSALoaderContext::SegmentAlloc( amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) { assert(size); assert(align); if (program_->isNull()) { void* ptr = amd::Os::alignedMalloc(size, align); if (zero) { memset(ptr, 0, size); } return ptr; } Segment* seg = new Segment(); if (seg != nullptr && !seg->alloc(*program_, segment, size, align, zero)) { return nullptr; } return seg; } bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) { if (program_->isNull()) { amd::Os::fastMemcpy(reinterpret_cast
(dst) + offset, src, size); return true; } Segment* s = reinterpret_cast(dst); s->copy(offset, src, size); return true; } void PALHSALoaderContext::SegmentFree( amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) { if (program_->isNull()) { amd::Os::alignedFree(seg); } else { Segment* s = reinterpret_cast(seg); delete s ; } } void* PALHSALoaderContext::SegmentAddress( amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) { assert(seg); if (program_->isNull()) { return (reinterpret_cast
(seg) + offset); } Segment* s = reinterpret_cast(seg); return reinterpret_cast(s->gpuAddress(offset)); } void* PALHSALoaderContext::SegmentHostAddress( amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) { assert(seg); if (program_->isNull()) { return (reinterpret_cast
(seg) + offset); } Segment* s = reinterpret_cast(seg); return s ->cpuAddress(offset); } bool PALHSALoaderContext::SegmentFreeze( amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) { if (program_->isNull()) { return true; } Segment* s = reinterpret_cast(seg); return s->freeze((segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) ? false : true); } hsa_status_t PALHSALoaderContext::SamplerCreate( hsa_agent_t agent, const hsa_ext_sampler_descriptor_t *sampler_descriptor, hsa_ext_sampler_t *sampler_handle) { if (!agent.handle) { return HSA_STATUS_ERROR_INVALID_AGENT; } if (!sampler_descriptor || !sampler_handle) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } if (program_->isNull()) { // Offline compilation. Provide a fake handle to avoid an assert sampler_handle->handle = 1; return HSA_STATUS_SUCCESS; } uint32_t state = 0; switch (sampler_descriptor->coordinate_mode) { case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break; case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: state = amd::Sampler::StateNormalizedCoordsTrue; break; default: assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } switch (sampler_descriptor->filter_mode) { case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break; case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: state |= amd::Sampler::StateFilterLinear; break; default: assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } switch (sampler_descriptor->address_mode) { case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: state |= amd::Sampler::StateAddressClampToEdge; break; case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break; case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: state |= amd::Sampler::StateAddressRepeat; break; case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break; case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break; default: assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } pal::Sampler* sampler = new pal::Sampler(program_->dev()); if (!sampler || !sampler->create(state)) { delete sampler; return HSA_STATUS_ERROR; } program_->addSampler(sampler); sampler_handle->handle = sampler->hwSrd(); return HSA_STATUS_SUCCESS; } hsa_status_t PALHSALoaderContext::SamplerDestroy( hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) { if (!agent.handle) { return HSA_STATUS_ERROR_INVALID_AGENT; } if (!sampler_handle.handle) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } return HSA_STATUS_SUCCESS; } #if defined(WITH_LIGHTNING_COMPILER) static hsa_status_t GetKernelNamesCallback( hsa_executable_t hExec, hsa_executable_symbol_t hSymbol, void *data) { auto symbol = Symbol::Object(hSymbol); auto symbolNameList = reinterpret_cast*>(data); hsa_symbol_kind_t type; if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type)) { return HSA_STATUS_ERROR; } if (type == HSA_SYMBOL_KIND_KERNEL) { uint32_t length; if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &length)) { return HSA_STATUS_ERROR; } char* name = (char*) alloca(length+1); if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME, name)) { return HSA_STATUS_ERROR; } name[length] = '\0'; symbolNameList->push_back(std::string(name)); } return HSA_STATUS_SUCCESS; } bool LightningProgram::linkImpl(amd::option::Options *options) { using namespace amd::opencl_driver; internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; aclType continueCompileFrom = llvmBinary_.empty() ? getNextCompilationStageFromBinary(options) : ACL_TYPE_LLVMIR_BINARY; if (continueCompileFrom == ACL_TYPE_ISA) { binary_t isa = binary(); if ((isa.first != NULL) && (isa.second > 0)) { return setKernels(options, (void*) isa.first, isa.second ); } else { buildLog_ += "Error: code object is empty \n" ; return false; } return true; } if (continueCompileFrom != ACL_TYPE_LLVMIR_BINARY) { buildLog_ += "Error while Codegen phase: the binary is incomplete \n" ; return false; } std::auto_ptr C(newCompilerInstance()); // call LinkLLVMBitcode std::vector inputs; // open the input IR source Data* input = C->NewBufferReference( DT_LLVM_BC, llvmBinary_.data(), llvmBinary_.size()); if (!input) { buildLog_ += "Error: Failed to open the compiled program.\n"; return false; } inputs.push_back(input); //< must be the first input // open the bitcode libraries Data* opencl_bc = C->NewBufferReference(DT_LLVM_BC, (const char*) opencl_amdgcn, opencl_amdgcn_size); Data* ocml_bc = C->NewBufferReference(DT_LLVM_BC, (const char*) ocml_amdgcn, ocml_amdgcn_size); Data* ockl_bc = C->NewBufferReference(DT_LLVM_BC, (const char*) ockl_amdgcn, ockl_amdgcn_size); Data* irif_bc = C->NewBufferReference(DT_LLVM_BC, (const char*) irif_amdgcn, irif_amdgcn_size); if (!opencl_bc || !ocml_bc || !ockl_bc || !irif_bc) { buildLog_ += "Error: Failed to open the bitcode library.\n"; return false; } inputs.push_back(opencl_bc); // depends on oclm & ockl inputs.push_back(ockl_bc); // depends on irif inputs.push_back(ocml_bc); // depends on irif inputs.push_back(irif_bc); // open the control functions auto isa_version = get_oclc_isa_version(dev().hwInfo()->gfxipVersion_); if (!isa_version.first) { buildLog_ += "Error: Linking for this device is not supported\n"; return false; } Data* isa_version_bc = C->NewBufferReference(DT_LLVM_BC, (const char*) isa_version.first, isa_version.second); if (!isa_version_bc) { buildLog_ += "Error: Failed to open the control functions.\n"; return false; } inputs.push_back(isa_version_bc); auto correctly_rounded_sqrt = get_oclc_correctly_rounded_sqrt( options->oVariables->FP32RoundDivideSqrt); Data* correctly_rounded_sqrt_bc = C->NewBufferReference(DT_LLVM_BC, correctly_rounded_sqrt.first, correctly_rounded_sqrt.second); auto daz_opt = get_oclc_daz_opt(dev().hwInfo()->gfxipVersion_ < 900 || options->oVariables->DenormsAreZero); Data* daz_opt_bc = C->NewBufferReference(DT_LLVM_BC, daz_opt.first, daz_opt.second); auto finite_only = get_oclc_finite_only(options->oVariables->FiniteMathOnly || options->oVariables->FastRelaxedMath); Data* finite_only_bc = C->NewBufferReference(DT_LLVM_BC, finite_only.first, finite_only.second); auto unsafe_math = get_oclc_unsafe_math(options->oVariables->UnsafeMathOpt || options->oVariables->FastRelaxedMath); Data* unsafe_math_bc = C->NewBufferReference(DT_LLVM_BC, unsafe_math.first, unsafe_math.second); if (!correctly_rounded_sqrt_bc || !daz_opt_bc || !finite_only_bc || !unsafe_math_bc) { buildLog_ += "Error: Failed to open the control functions.\n"; return false; } if (!correctly_rounded_sqrt_bc || !daz_opt_bc || !finite_only_bc || !unsafe_math_bc) { buildLog_ += "Error: Failed to open the control functions.\n"; return false; } inputs.push_back(correctly_rounded_sqrt_bc); inputs.push_back(daz_opt_bc); inputs.push_back(finite_only_bc); inputs.push_back(unsafe_math_bc); // open the linked output std::vector linkOptions; amd::opencl_driver::Buffer* linked_bc = C->NewBuffer(DT_LLVM_BC); if (!linked_bc) { buildLog_ += "Error: Failed to open the linked program.\n"; return false; } bool ret = C->LinkLLVMBitcode(inputs, linked_bc, linkOptions); buildLog_ += C->Output(); if (!ret) { buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; return false; } if (options->isDumpFlagSet(amd::option::DUMP_BC_LINKED)) { std::ofstream f(options->getDumpFileName("_linked.bc").c_str(), std::ios::trunc); if(f.is_open()) { f.write(linked_bc->Buf().data(), linked_bc->Size()); } else { buildLog_ += "Warning: opening the file to dump the linked IR failed.\n"; } } inputs.clear(); inputs.push_back(linked_bc); amd::opencl_driver::Buffer* out_exec = C->NewBuffer(DT_EXECUTABLE); if (!out_exec) { buildLog_ += "Error: Failed to create the linked executable.\n"; return false; } std::string codegenOptions(options->llvmOptions); // Set the machine target std::ostringstream mCPU; mCPU << " -mcpu=gfx" << dev().hwInfo()->gfxipVersion_; codegenOptions.append(mCPU.str()); // Set the -O# std::ostringstream optLevel; optLevel << "-O" << options->oVariables->OptLevel; codegenOptions.append(" ").append(optLevel.str()); // Tokenize the options string into a vector of strings std::istringstream strstr(codegenOptions); std::istream_iterator sit(strstr), end; std::vector params(sit, end); ret = C->CompileAndLinkExecutable(inputs, out_exec, params); buildLog_ += C->Output(); if (!ret) { buildLog_ += "Error: Creating the executable failed: Compiling LLVM IRs to exe.\n"; return false; } if (options->isDumpFlagSet(amd::option::DUMP_O)) { std::ofstream f(options->getDumpFileName(".so").c_str(), std::ios::trunc); if(f.is_open()) { f.write(out_exec->Buf().data(), out_exec->Size()); } else { buildLog_ += "Warning: opening the file to dump the code object failed.\n"; } } return setKernels(options, out_exec->Buf().data(), out_exec->Size()); } bool LightningProgram::setKernels( amd::option::Options *options, void* binary, size_t size ) { hsa_agent_t agent; agent.handle = 1; executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); if (executable_ == nullptr) { buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; return false; } hsa_code_object_t code_object; code_object.handle = reinterpret_cast(binary); hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); if (status != HSA_STATUS_SUCCESS) { buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; return false; } status = executable_->Freeze(nullptr); if (status != HSA_STATUS_SUCCESS) { buildLog_ += "Error: Freezing the executable failed: "; return false; } size_t progvarsTotalSize = 0; // Begin the Elf image from memory Elf* e = elf_memory((char*) binary, size, NULL); if (elf_kind(e) != ELF_K_ELF) { buildLog_ += "Error while reading the ELF program binary\n"; return false; } size_t shstrndx; if (elf_getshdrstrndx(e, &shstrndx) != 0) { buildLog_ += "Error while reading the ELF program binary\n"; return false; } // Iterate over the sections for (Elf_Scn* scn = elf_nextscn(e, 0); scn; scn = elf_nextscn(e, scn)) { GElf_Shdr shdr; if (gelf_getshdr(scn, &shdr) != &shdr) { continue; } // Skip non-program sections if (shdr.sh_type != SHT_PROGBITS) { continue; } // Accumulate the size of A & !X sections if ((shdr.sh_flags & SHF_ALLOC) && !(shdr.sh_flags & SHF_EXECINSTR)) { progvarsTotalSize += shdr.sh_size; } // Check if this is the metadata section const char* name = elf_strptr(e, shstrndx , shdr.sh_name); if (name && !strcmp(name, ".AMDGPU.runtime_metadata")) { // Assume a single Elf_Data, the parser will fail if it isn't Elf_Data* data = elf_getdata(scn, NULL); if (!data) { buildLog_ += "Error while reading ELF program binary " \ "runtime metadata section\n"; return false; } metadata_ = new amd::hsa::code::Program::Metadata(); if (!metadata_ || !metadata_->ReadFrom(data->d_buf, data->d_size)) { buildLog_ += "Error while parsing ELF program binary " \ "runtime metadata section\n"; return false; } } } elf_end(e); if (!metadata_) { buildLog_ += "Error: runtime metadata section not present in " \ "ELF program binary\n"; return false; } setGlobalVariableTotalSize(progvarsTotalSize); // Get the list of kernels std::vector kernelNameList; status = executable_->IterateSymbols(GetKernelNamesCallback, (void *) &kernelNameList ); if (status != HSA_STATUS_SUCCESS) { buildLog_ += "Error: Failed to get kernel names\n"; return false; } for (auto &kernelName : kernelNameList) { auto kernel = new LightningKernel( kernelName, this, options->origOptionStr + hsailOptions()); kernels()[kernelName] = kernel; auto symbol = executable_->GetSymbol("", kernelName.c_str(), agent, 0); if (!symbol) { buildLog_ += "Error: Getting kernel symbol '" + kernelName + "' from AMD HSA Code Object failed. " \ "Kernel initialization failed.\n"; return false; } if (!kernel->init(symbol)) { buildLog_ += "Error: Kernel '" + kernelName + "' initialization failed.\n"; return false; } buildLog_ += kernel->buildLog(); kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); // Find max scratch regs used in the program. It's used for scratch buffer preallocation // with dynamic parallelism, since runtime doesn't know which child kernel will be called maxScratchRegs_ = std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); } // Allocate kernel table for device enqueuing if (!isNull() && false/*dynamicParallelism*/ && !allocKernelTable()) { return false; } // Save the binary and type clBinary()->saveBIFBinary((char*)binary, size); setType(TYPE_EXECUTABLE); return true; } LightningProgram::~LightningProgram() { delete metadata_; } #endif // defined(WITH_LIGHTNING_COMPILER) } // namespace pal