dee2f4bd71
SWDEV-79445 - OCL generic changes and code clean-up Program compilation clean-up. Step#3: - Move getCompilationStagesFromBinary and getNextCompilationStageFromBinary to the abstraction layer. - Share the same functionality across GSL, PAL and ROCr backends Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#243 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.hpp#76 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#75 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#88 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#37 edit
1284 righe
43 KiB
C++
1284 righe
43 KiB
C++
//
|
|
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
#include "os/os.hpp"
|
|
#include "utils/flags.hpp"
|
|
#include "include/aclTypes.h"
|
|
#include "utils/amdilUtils.hpp"
|
|
#include "utils/bif_section_labels.hpp"
|
|
#include "device/pal/palprogram.hpp"
|
|
#include "device/pal/palblit.hpp"
|
|
#include "macrodata.h"
|
|
#include "MDParser/AMDILMDInterface.h"
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <cstdio>
|
|
#include <algorithm>
|
|
#include <iterator>
|
|
#include "utils/options.hpp"
|
|
#include "hsa.h"
|
|
#include "hsa_ext_image.h"
|
|
#include "amd_hsa_loader.hpp"
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
#include "llvm/Support/AMDGPUMetadata.h"
|
|
#include "driver/AmdCompiler.h"
|
|
#include "libraries.amdgcn.inc"
|
|
#include "gelf.h"
|
|
#endif  // defined(WITH_LIGHTNING_COMPILER)
|
|
|
|
namespace pal {
|
|
|
|
// Creates an empty segment: the GPU backing store, the CPU-visible staging resource and the
// system-memory shadow buffer all start out unset.
Segment::Segment()
    : gpuAccess_(nullptr),
      cpuAccess_(nullptr),
      cpuMem_(nullptr) {}
|
|
|
|
// Frees the GPU backing store, then tears down whatever CPU-side access objects still exist.
Segment::~Segment() {
  delete gpuAccess_;

  // Unmaps/deletes cpuAccess_ and releases cpuMem_ if they are still allocated.
  DestroyCpuAccess();
}
|
|
|
|
void Segment::DestroyCpuAccess() {
|
|
if (cpuAccess_ != nullptr) {
|
|
cpuAccess_->unmap(nullptr);
|
|
delete cpuAccess_;
|
|
cpuAccess_ = nullptr;
|
|
}
|
|
if (cpuMem_ != nullptr) {
|
|
delete[] cpuMem_;
|
|
cpuMem_ = nullptr;
|
|
}
|
|
}
|
|
|
|
// Allocates the backing storage for one loader segment.
//
// prog     - program that owns the segment; receives the global store / code object registration
// segment  - AMDGPU HSA segment kind being allocated
// size     - requested size in bytes
// align    - requested alignment; rounded up to a multiple of sizeof(uint32_t)
// zero     - when true the storage is cleared
//
// Returns false if the GPU memory (or, for code segments, the fallback system memory) could not
// be created.
bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t size, size_t align,
                    bool zero) {
  align = amd::alignUp(align, sizeof(uint32_t));
  const size_t allocSize = amd::alignUp(size, align);

  gpuAccess_ = new pal::Memory(prog.dev(), allocSize);
  if ((gpuAccess_ == nullptr) || !gpuAccess_->create(pal::Resource::Shader)) {
    delete gpuAccess_;
    gpuAccess_ = nullptr;
    return false;
  }

  const bool isCodeSegment = (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT);

  if (isCodeSegment) {
    // Code segments additionally get a CPU-visible copy: a remote resource when it can be
    // created, otherwise a plain system-memory buffer.
    void* hostPtr = nullptr;
    cpuAccess_ = new pal::Memory(prog.dev(), allocSize);
    if ((cpuAccess_ != nullptr) && cpuAccess_->create(pal::Resource::Remote)) {
      hostPtr = cpuAccess_->map(nullptr, 0);
    } else {
      delete cpuAccess_;
      cpuAccess_ = nullptr;
      cpuMem_ = reinterpret_cast<address>(new char[allocSize]);
      hostPtr = cpuMem_;
      if (cpuMem_ == nullptr) {
        return false;
      }
    }
    if (zero) {
      memset(hostPtr, 0, size);
    }
  }

  if (zero && !prog.isInternal()) {
    // Clear the GPU memory; use the 8-byte pattern only when the size is a multiple of it.
    uint64_t fillPattern = 0;
    const size_t fillPatternSize =
        ((size % sizeof(fillPattern)) == 0) ? sizeof(fillPattern) : 1;
    prog.dev().xferMgr().fillBuffer(*gpuAccess_, &fillPattern, fillPatternSize, amd::Coord3D(0),
                                    amd::Coord3D(size));
  }

  if ((segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
      (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
      (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT)) {
    // Global/readonly data: register the store and account for its size in the program.
    prog.addGlobalStore(gpuAccess_);
    prog.setGlobalVariableTotalSize(prog.globalVariableTotalSize() + size);
  } else if (isCodeSegment) {
    prog.setCodeObjects(this, gpuAccess_, reinterpret_cast<address>(cpuAddress(0)));
  }
  return true;
}
|
|
|
|
void Segment::copy(size_t offset, const void* src, size_t size) {
|
|
if (cpuAccess_ != nullptr) {
|
|
amd::Os::fastMemcpy(cpuAddress(offset), src, size);
|
|
} else {
|
|
if (cpuMem_ != nullptr) {
|
|
amd::Os::fastMemcpy(cpuAddress(offset), src, size);
|
|
}
|
|
amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
|
|
VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
|
|
Memory& xferBuf = gpu.xferWrite().Acquire(size);
|
|
size_t tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
|
|
size_t srcOffs = 0;
|
|
while (size != 0) {
|
|
xferBuf.hostWrite(&gpu, reinterpret_cast<const_address>(src) + srcOffs, 0, tmpSize);
|
|
xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true);
|
|
size -= tmpSize;
|
|
srcOffs += tmpSize;
|
|
tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
|
|
}
|
|
gpu.xferWrite().Release(xferBuf);
|
|
gpu.waitAllEngines();
|
|
}
|
|
}
|
|
|
|
bool Segment::freeze(bool destroySysmem) {
|
|
VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
|
|
bool result = true;
|
|
if (cpuAccess_ != nullptr) {
|
|
assert(gpuAccess_->size() == cpuAccess_->size() && "Backing store size mismatch!");
|
|
amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
|
|
result = cpuAccess_->partialMemCopyTo(gpu, 0, 0, gpuAccess_->size(), *gpuAccess_, false, true);
|
|
gpu.waitAllEngines();
|
|
}
|
|
assert(!destroySysmem || (cpuAccess_ == nullptr));
|
|
return result;
|
|
}
|
|
|
|
const static char* Carrizo = "Carrizo";
|
|
HSAILProgram::HSAILProgram(Device& device)
|
|
: Program(device),
|
|
rawBinary_(nullptr),
|
|
kernels_(nullptr),
|
|
codeSegGpu_(nullptr),
|
|
codeSegment_(nullptr),
|
|
maxScratchRegs_(0),
|
|
executable_(nullptr),
|
|
loaderContext_(this) {
|
|
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
|
|
if (dev().asicRevision() == Pal::AsicRevision::Bristol) {
|
|
machineTarget_ = Carrizo;
|
|
} else {
|
|
machineTarget_ = dev().hwInfo()->targetName_;
|
|
}
|
|
loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
|
|
}
|
|
|
|
// Builds a program object for a null (offline compilation) device. Identical to the real-device
// constructor except that the program is flagged as null.
HSAILProgram::HSAILProgram(NullDevice& device)
    : Program(device),
      rawBinary_(nullptr),
      kernels_(nullptr),
      codeSegGpu_(nullptr),
      codeSegment_(nullptr),
      maxScratchRegs_(0),
      executable_(nullptr),
      loaderContext_(this) {
  isNull_ = true;
  xnackEnabled_ = dev().hwInfo()->xnackEnabled_;

  // Bristol is compiled as Carrizo; every other ASIC uses its own target name.
  const bool isBristol = (dev().asicRevision() == Pal::AsicRevision::Bristol);
  if (isBristol) {
    machineTarget_ = Carrizo;
  } else {
    machineTarget_ = dev().hwInfo()->targetName_;
  }

  loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
}
|
|
|
|
// Releases everything the program owns: static samplers, the serialized and in-memory ACL
// binaries (HSAIL path only), the CL binary, the loaded executable, the kernel table and finally
// the loader itself.
HSAILProgram::~HSAILProgram() {
  // Destroy internal static samplers
  for (const auto& sampler : staticSamplers_) {
    delete sampler;
  }

#if !defined(WITH_LIGHTNING_COMPILER)
  if (rawBinary_ != nullptr) {
    aclFreeMem(binaryElf_, rawBinary_);
  }

  // Free the elf binary; a failure is only logged.
  if (binaryElf_ != nullptr) {
    const acl_error finiStatus = aclBinaryFini(binaryElf_);
    if (finiStatus != ACL_SUCCESS) {
      LogWarning("Error while destroying the acl binary \n");
    }
  }
#endif  // !defined(WITH_LIGHTNING_COMPILER)

  releaseClBinary();

  // The executable belongs to the loader, so it must go before the loader is destroyed.
  if (executable_ != nullptr) {
    loader_->DestroyExecutable(executable_);
  }
  delete kernels_;
  amd::hsa::loader::Loader::Destroy(loader_);
}
|
|
|
|
// Prepares the program for a build: runs the base-class initialization, publishes per-build
// information to the options object and sets up the output ELF of the CL binary.
//
// Returns false when the base initialization or the ELF output setup fails.
bool HSAILProgram::initBuild(amd::option::Options* options) {
  if (!device::Program::initBuild(options)) {
    return false;
  }

  // Use the hardware machine target as the device name, or "gpu" when it is missing/empty.
  const char* devName = dev().hwInfo()->machineTarget_;
  const bool hasDevName = (devName && (devName[0] != '\0'));
  options->setPerBuildInfo(hasDevName ? devName : "gpu", clBinary()->getEncryptCode(), true);

  // true means fsail required
  clBinary()->init(options, true);

  // Elf Binary setup: an output file is used only when BIF dumping was requested.
  std::string dumpFileName;
  if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
    dumpFileName = options->getDumpFileName(".bin");
  }
  const char* elfOutPath = (dumpFileName.size() > 0) ? dumpFileName.c_str() : nullptr;

  if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), elfOutPath)) {
    LogError("Setup elf out for gpu failed");
    return false;
  }
  return true;
}
|
|
|
|
bool HSAILProgram::finiBuild(bool isBuildGood) {
|
|
clBinary()->resetElfOut();
|
|
clBinary()->resetElfIn();
|
|
|
|
if (!isBuildGood) {
|
|
// Prevent the encrypted binary form leaking out
|
|
clBinary()->setBinary(nullptr, 0);
|
|
}
|
|
|
|
return device::Program::finiBuild(isBuildGood);
|
|
}
|
|
|
|
// Links the LLVM IR of several compiled programs into this program (HSAIL path only).
//
// inputPrograms - programs to link; each is expected to be an HSAILProgram
// options       - build options; the original option string is forwarded to the compiler
// createLibrary - when true the linked IR is saved as a library and no code is generated
//
// For every input an aclBinary holding its LLVM IR is created (SPIR-V inputs are first lowered to
// LLVM IR), the binaries are linked with aclLink() and the result becomes binaryElf_. Unless a
// library was requested, the single-program linkImpl(options) then finishes the build.
//
// Returns false and appends a message to the build log on any failure. All intermediate
// aclBinary objects created here are released on failure.
bool HSAILProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
                            amd::option::Options* options, bool createLibrary) {
#if defined(WITH_LIGHTNING_COMPILER)
  assert(!"Should not reach here");
  return false;
#else   // !defined(WITH_LIGHTNING_COMPILER)
  // Nothing to link: bail out instead of indexing an empty binary list below.
  if (inputPrograms.empty()) {
    buildLog_ += "Error while linking : No input programs";
    return false;
  }

  acl_error errorCode;

  // For each program we need to extract the LLVMIR and create
  // aclBinary for each
  std::vector<aclBinary*> binaries_to_link;
  binaries_to_link.reserve(inputPrograms.size());

  // Releases every intermediate binary created so far (used on error paths).
  auto releaseBinaries = [&binaries_to_link]() {
    for (aclBinary* created : binaries_to_link) {
      aclBinaryFini(created);
    }
    binaries_to_link.clear();
  };

  for (device::Program* input : inputPrograms) {
    HSAILProgram* program = static_cast<HSAILProgram*>(input);

    // Check if the program was created with clCreateProgramWithBinary
    binary_t binary = program->binary();
    if ((binary.first != nullptr) && (binary.second > 0)) {
      // Binary already exists -- we can also check if there is no
      // opencl source code
      // Need to check if LLVMIR exists in the binary
      // If LLVMIR does not exist then is it valid
      // We need to pull out all the compiled kernels
      // We cannot do this at present because we need at least
      // Hsail text to pull the kernels out
      void* mem = const_cast<void*>(binary.first);
      binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
      if (errorCode != ACL_SUCCESS) {
        LogWarning("Error while linking : Could not read from raw binary");
        releaseBinaries();
        return false;
      }
    }

    // At this stage each HSAILProgram contains a valid binary_elf
    // Check if LLVMIR is in the binary
    // @TODO - Memory leak , cannot free this buffer
    // need to fix this.. File EPR on compiler library
    size_t llvmirSize = 0;
    // Only the error code matters here: it tells whether an LLVMIR section is present.
    static_cast<void>(
        aclExtractSection(dev().compiler(), binaryElf_, &llvmirSize, aclLLVMIR, &errorCode));
    if (errorCode != ACL_SUCCESS) {
      // No LLVM IR: the only other accepted input is SPIR-V, which is lowered to LLVM IR.
      bool spirv = false;
      size_t boolSize = sizeof(bool);
      errorCode =
          aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize);
      if (errorCode != ACL_SUCCESS) {
        spirv = false;
      }
      if (!spirv) {
        buildLog_ += "Error while linking : Invalid binary (Missing LLVMIR section)";
        releaseBinaries();
        return false;
      }
      errorCode = aclCompile(dev().compiler(), binaryElf_, options->origOptionStr.c_str(),
                             ACL_TYPE_SPIRV_BINARY, ACL_TYPE_LLVMIR_BINARY, nullptr);
      buildLog_ += aclGetCompilerLog(dev().compiler());
      if (errorCode != ACL_SUCCESS) {
        buildLog_ += "Error while linking: Could not load SPIR-V";
        releaseBinaries();
        return false;
      }
    }

    // Create a new aclBinary for each LLVMIR and save it in a list
    aclBIFVersion ver = aclBinaryVersion(binaryElf_);
    aclBinary* bin = aclCreateFromBinary(binaryElf_, ver);
    if (bin == nullptr) {
      buildLog_ += "Error while linking : Could not create a binary for linking";
      releaseBinaries();
      return false;
    }
    binaries_to_link.push_back(bin);
  }

  // The first binary is the link destination; the remaining ones are the libraries linked in.
  errorCode = aclLink(dev().compiler(), binaries_to_link[0], binaries_to_link.size() - 1,
                      binaries_to_link.size() > 1 ? &binaries_to_link[1] : nullptr,
                      ACL_TYPE_LLVMIR_BINARY, "-create-library", nullptr);
  if (errorCode != ACL_SUCCESS) {
    buildLog_ += aclGetCompilerLog(dev().compiler());
    buildLog_ += "Error while linking : aclLink failed";
    releaseBinaries();
    return false;
  }

  // Store the newly linked aclBinary for this program.
  binaryElf_ = binaries_to_link[0];

  // Free all the other aclBinaries
  for (size_t i = 1; i < binaries_to_link.size(); i++) {
    aclBinaryFini(binaries_to_link[i]);
  }

  if (createLibrary) {
    saveBinaryAndSetType(TYPE_LIBRARY);
    buildLog_ += aclGetCompilerLog(dev().compiler());
    return true;
  }

  // Now call linkImpl with the new options
  return linkImpl(options);
#endif  // !defined(WITH_LIGHTNING_COMPILER)
}
|
|
|
|
// Splits a NUL-terminated string into its whitespace-separated tokens.
//
// str - text to split; may be nullptr, which yields an empty result
//
// Returns the tokens in order of appearance. Runs of whitespace (spaces, tabs, newlines) act as
// a single separator and leading/trailing whitespace is ignored.
inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
  std::vector<std::string> tokens;
  if (str == nullptr) {
    // Constructing a std::string from a null pointer is undefined behaviour.
    return tokens;
  }

  std::stringstream stream{std::string(str)};
  for (std::string token; stream >> token;) {
    tokens.push_back(token);
  }
  return tokens;
}
|
|
|
|
// Finishes the build of a single program on the HSAIL path.
//
// Starting from the stage the current binary is at, the program is compiled to the CG stage and
// finalized to ISA, the resulting code object is loaded and frozen through the HSA loader, and a
// HSAILKernel is created and initialized for every kernel name reported by the binary. On
// success the binary is saved as an executable.
//
// Returns false and appends a message to the build log on any failure.
bool HSAILProgram::linkImpl(amd::option::Options* options) {
#if defined(WITH_LIGHTNING_COMPILER)
  assert(!"Should not reach here");
  return false;
#else   // !defined(WITH_LIGHTNING_COMPILER)
  acl_error errorCode;
  bool finalize = true;
  const bool hsaLoad = true;
  internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos);

  // If !binaryElf_ then program must have been created using clCreateProgramWithBinary
  aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
  if (!binaryElf_) {
    continueCompileFrom = getNextCompilationStageFromBinary(options);
  }

  switch (continueCompileFrom) {
    // All of the stages below are compiled up to ACL_TYPE_CG:
    //  - ACL_TYPE_LLVMIR_BINARY when the program is not created with a binary, when the binary
    //    contains only .llvmir & .comment, or when it also has brig sections but its compile &
    //    link options differ from the current ones (recompilation);
    //  - ACL_TYPE_HSAIL_BINARY when the binary contains only brig sections;
    //  - ACL_TYPE_HSAIL_TEXT when the binary contains only hsail text.
    case ACL_TYPE_SPIRV_BINARY:
    case ACL_TYPE_SPIR_BINARY:
    case ACL_TYPE_LLVMIR_BINARY:
    case ACL_TYPE_HSAIL_BINARY:
    case ACL_TYPE_HSAIL_TEXT: {
      const std::string cgOptions = options->origOptionStr + ProcessOptions(options);
      errorCode = aclCompile(dev().compiler(), binaryElf_, cgOptions.c_str(), continueCompileFrom,
                             ACL_TYPE_CG, nullptr);
      buildLog_ += aclGetCompilerLog(dev().compiler());
      if (errorCode != ACL_SUCCESS) {
        buildLog_ += "Error: BRIG code generation failed.\n";
        return false;
      }
      break;
    }
    case ACL_TYPE_CG:
      // Already at the CG stage: only finalization is left.
      break;
    case ACL_TYPE_ISA:
      // ISA is already present: skip finalization.
      finalize = false;
      break;
    default:
      buildLog_ +=
          "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be "
          "performed.\n";
      return false;
  }

  if (finalize) {
    std::string finalizerOptions(options->origOptionStr + ProcessOptions(options));

    // Append an option so that we can selectively enable a SCOption on CZ
    // whenever IOMMUv2 is enabled.
    if (dev().settings().svmFineGrainSystem_) {
      finalizerOptions.append(" -sc-xnack-iommu");
    }
    if (dev().settings().gfx10Plus_) {
      if (GPU_FORCE_WAVE_SIZE_32) {
        finalizerOptions.append(" -force-wave-size-32");
      }
      if (dev().hwInfo()->xnackEnabled_) {
        finalizerOptions.append(" -xnack");
      }
    }

    errorCode = aclCompile(dev().compiler(), binaryElf_, finalizerOptions.c_str(), ACL_TYPE_CG,
                           ACL_TYPE_ISA, nullptr);
    buildLog_ += aclGetCompilerLog(dev().compiler());
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: BRIG finalization to ISA failed.\n";
      return false;
    }
  }

  // ACL_TYPE_CG stage is not performed for offline compilation
  hsa_agent_t agent;
  agent.handle = 1;

  if (hsaLoad) {
    executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr);
    if (executable_ == nullptr) {
      buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
      return false;
    }

    // The code object handle is the address of the text section inside the binary.
    size_t textSize = 0;
    hsa_code_object_t codeObject;
    codeObject.handle = reinterpret_cast<uint64_t>(
        aclExtractSection(dev().compiler(), binaryElf_, &textSize, aclTEXT, &errorCode));
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n";
      return false;
    }

    hsa_status_t loadStatus = executable_->LoadCodeObject(agent, codeObject, nullptr);
    if (loadStatus != HSA_STATUS_SUCCESS) {
      buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
      return false;
    }

    loadStatus = executable_->Freeze(nullptr);
    if (loadStatus != HSA_STATUS_SUCCESS) {
      buildLog_ += "Error: AMD HSA Code Object freeze failed.\n";
      return false;
    }
  }

  // First query: size of the kernel name list.
  size_t kernelNamesSize = 0;
  errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr,
                           &kernelNamesSize);
  if (errorCode != ACL_SUCCESS) {
    buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
    return false;
  }

  if (kernelNamesSize > 0) {
    // Second query: the whitespace-separated kernel name list itself.
    std::vector<char> kernelNames(kernelNamesSize);
    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
                             kernelNames.data(), &kernelNamesSize);
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
      return false;
    }
    const std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames.data());

    bool dynamicParallelism = false;
    for (const std::string& kernelName : vKernels) {
      const std::string openclKernelName = device::Kernel::openclMangledName(kernelName);

      // The kernel is registered before initialization, so the program owns it even when one of
      // the following steps fails.
      HSAILKernel* aKernel =
          new HSAILKernel(kernelName, this, options->origOptionStr + ProcessOptions(options));
      kernels()[kernelName] = aKernel;

      amd::hsa::loader::Symbol* sym = executable_->GetSymbol(openclKernelName.c_str(), &agent);
      if (!sym) {
        buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName +
            "' from AMD HSA Code Object failed. Kernel initialization failed.\n";
        return false;
      }
      if (!aKernel->init(sym, false)) {
        buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n";
        return false;
      }
      buildLog_ += aKernel->buildLog();
      aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
      dynamicParallelism |= aKernel->dynamicParallelism();

      // Find max scratch regs used in the program. It's used for scratch buffer preallocation
      // with dynamic parallelism, since runtime doesn't know which child kernel will be called
      maxScratchRegs_ =
          std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
    }

    // Allocate kernel table for device enqueuing
    if (!isNull() && dynamicParallelism && !allocKernelTable()) {
      return false;
    }
  }

  DestroySegmentCpuAccess();

  // Save the binary in the interface class
  saveBinaryAndSetType(TYPE_EXECUTABLE);
  buildLog_ += aclGetCompilerLog(dev().compiler());
  return true;
#endif  // !defined(WITH_LIGHTNING_COMPILER)
}
|
|
|
|
// Nothing to do on the HSAIL path: always reports success and ignores `options`.
bool HSAILProgram::createBinary(amd::option::Options* options) {
  return true;
}
|
|
|
|
bool HSAILProgram::allocKernelTable() {
|
|
uint size = kernels().size() * sizeof(size_t);
|
|
|
|
kernels_ = new pal::Memory(dev(), size);
|
|
// Initialize kernel table
|
|
if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) {
|
|
delete kernels_;
|
|
return false;
|
|
} else {
|
|
size_t* table = reinterpret_cast<size_t*>(kernels_->map(nullptr, pal::Resource::WriteOnly));
|
|
for (auto& it : kernels()) {
|
|
HSAILKernel* kernel = static_cast<HSAILKernel*>(it.second);
|
|
table[kernel->index()] = static_cast<size_t>(kernel->gpuAqlCode());
|
|
}
|
|
kernels_->unmap(nullptr);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Adds the program's GPU code segment memory to the VM memory list of the given queue.
void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const {
  const auto* codeMemory = &codeSegGpu();
  gpu.addVmMemory(codeMemory);
}
|
|
|
|
// Refreshes and returns the cached ACL target info for this device (HSAIL path only).
//
// str - target name; an empty string selects the device's own target name.
//       NOTE(review): a null `str` is forwarded to aclGetTargetInfo() unchanged - confirm that
//       this is intended.
//
// The architecture is "hsail64" for 64-bit pointer devices and "hsail" otherwise. A failure of
// aclGetTargetInfo() is only logged; info_ is returned in either case.
const aclTargetInfo& HSAILProgram::info(const char* str) {
#if defined(WITH_LIGHTNING_COMPILER)
  assert(!"Should not reach here");
#else   // !defined(WITH_LIGHTNING_COMPILER)
  acl_error err;
  const std::string arch = dev().settings().use64BitPtr_ ? "hsail64" : "hsail";
  const char* target = (str && str[0] == '\0') ? dev().hwInfo()->targetName_ : str;

  info_ = aclGetTargetInfo(arch.c_str(), target, &err);
  if (err != ACL_SUCCESS) {
    LogWarning("aclGetTargetInfo failed");
  }
#endif  // !defined(WITH_LIGHTNING_COMPILER)
  return info_;
}
|
|
|
|
// Serializes binaryElf_ to memory, stores the result as the program binary and records the
// program type (HSAIL path only).
//
// Any previously serialized image is freed first. Returns false when serialization fails.
bool HSAILProgram::saveBinaryAndSetType(type_t type) {
#if defined(WITH_LIGHTNING_COMPILER)
  assert(!"Should not reach here");
#else   // !defined(WITH_LIGHTNING_COMPILER)
  // Free memory containing the previous rawBinary, if any
  if (rawBinary_ != nullptr) {
    aclFreeMem(binaryElf_, rawBinary_);
    rawBinary_ = nullptr;
  }

  // Write binary to memory
  size_t binarySize = 0;
  const acl_error writeStatus = aclWriteToMem(binaryElf_, &rawBinary_, &binarySize);
  if (writeStatus != ACL_SUCCESS) {
    buildLog_ += "Failed to write binary to memory \n";
    return false;
  }
  setBinary(static_cast<char*>(rawBinary_), binarySize);

  // Set the type of binary
  setType(type);
#endif  // !defined(WITH_LIGHTNING_COMPILER)
  return true;
}
|
|
|
|
// Converts an ISA name into an hsa_isa_t whose handle is the numeric GFXIP version.
//
// Names starting with "amdgcn-" take the number that follows "gfx". Any other name is parsed the
// old way: the last three ':'-separated fields are read as digits and combined as
// field[-3] * 100 + field[-2] * 10 + field[-1].
hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
  hsa_isa_t isa = {0};
  uint32_t version = 0;
  const std::string target(name);

  if (target.compare(0, 7, "amdgcn-") == 0) {
    const std::string digits = target.substr(target.find("gfx") + 3);
    version = std::atoi(digits.c_str());
  } else {
    // FIXME: Old way. To be removed.
    size_t last = target.length();
    for (uint32_t scale = 1; scale <= 100; scale *= 10) {
      // Walk the fields from the end of the string towards the front.
      const size_t first = target.find_last_of(':', last);
      const std::string field = target.substr(first + 1, last - first);
      last = first - 1;
      version += static_cast<uint32_t>(atoi(field.c_str())) * scale;
    }
  }

  isa.handle = version;
  return isa;
}
|
|
|
|
// Checks whether code built for `isa` can run on this program's device.
//
// Both versions are split into major (value / 10) and minor (value % 10). The majors must match
// and the code object's minor must not exceed the device's minor; a lower minor is accepted with
// a warning.
bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
  const auto deviceVersion = program_->dev().hwInfo()->gfxipVersion_;
  const uint32_t majorSrc = deviceVersion / 10;
  const uint32_t minorSrc = deviceVersion % 10;

  const uint32_t majorTrg = isa.handle / 10;
  const uint32_t minorTrg = isa.handle % 10;

  if ((majorSrc != majorTrg) || (minorTrg > minorSrc)) {
    return false;
  }
  if (minorTrg < minorSrc) {
    LogWarning("ISA downgrade for execution!");
  }
  return true;
}
|
|
|
|
// Loader callback: allocates storage for one segment of a code object.
//
// For a null (offline) program the segment is a plain aligned host buffer, optionally zeroed.
// Otherwise a Segment object is created and backed by device memory.
//
// Returns an opaque handle (host pointer or Segment*), or nullptr on failure.
void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
                                        size_t size, size_t align, bool zero) {
  assert(size);
  assert(align);

  if (program_->isNull()) {
    // Note: In Linux ::posix_memalign() requires at least 16 bytes for the alignment.
    align = amd::alignUp(align, 16);
    void* ptr = amd::Os::alignedMalloc(size, align);
    if ((ptr != nullptr) && zero) {
      memset(ptr, 0, size);
    }
    return ptr;
  }

  Segment* seg = new Segment();
  if (seg == nullptr) {
    return nullptr;
  }
  if (!seg->alloc(*program_, segment, size, align, zero)) {
    // The Segment is registered with the program only when alloc() succeeds, so it has to be
    // released here or it would leak.
    delete seg;
    return nullptr;
  }
  return seg;
}
|
|
|
|
// Loader callback: copies `size` bytes from `src` into the segment `dst` at byte `offset`.
// For a null program `dst` is a raw host buffer; otherwise it is a Segment. Always returns true.
bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
                                      void* dst, size_t offset, const void* src, size_t size) {
  if (!program_->isNull()) {
    static_cast<Segment*>(dst)->copy(offset, src, size);
  } else {
    amd::Os::fastMemcpy(reinterpret_cast<address>(dst) + offset, src, size);
  }
  return true;
}
|
|
|
|
// Loader callback: releases a segment handle returned by SegmentAlloc(). The handle is a raw
// aligned host buffer for a null program and a Segment object otherwise.
void PALHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
                                      void* seg, size_t size) {
  if (!program_->isNull()) {
    delete static_cast<Segment*>(seg);
    return;
  }
  amd::Os::alignedFree(seg);
}
|
|
|
|
// Loader callback: returns the address of byte `offset` inside the segment - a host address for
// a null program, otherwise the Segment's GPU address converted to a pointer.
void* PALHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
                                          void* seg, size_t offset) {
  assert(seg);
  if (!program_->isNull()) {
    Segment* s = static_cast<Segment*>(seg);
    return reinterpret_cast<void*>(s->gpuAddress(offset));
  }
  return (reinterpret_cast<address>(seg) + offset);
}
|
|
|
|
// Loader callback: returns the host-visible address of byte `offset` inside the segment - the
// raw buffer for a null program, otherwise the Segment's CPU address.
void* PALHSALoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
                                              void* seg, size_t offset) {
  assert(seg);
  if (!program_->isNull()) {
    return static_cast<Segment*>(seg)->cpuAddress(offset);
  }
  return (reinterpret_cast<address>(seg) + offset);
}
|
|
|
|
// Loader callback: freezes a segment. Null programs have nothing to do. For real programs the
// Segment is frozen, passing destroySysmem = false for code segments and true for all others.
bool PALHSALoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
                                        void* seg, size_t size) {
  if (program_->isNull()) {
    return true;
  }
  const bool destroySysmem = (segment != AMDGPU_HSA_SEGMENT_CODE_AGENT);
  return static_cast<Segment*>(seg)->freeze(destroySysmem);
}
|
|
|
|
// Loader callback: creates a device sampler matching an HSA sampler descriptor.
//
// The descriptor's coordinate, filter and addressing modes are translated into amd::Sampler
// state bits, a pal::Sampler is created from them and registered with the program, and its
// hardware SRD is returned through `sampler_handle`. For a null (offline) program no sampler is
// created and a fake handle of 1 is returned.
//
// Returns INVALID_AGENT for a zero agent handle, INVALID_ARGUMENT for null pointers or unknown
// modes, and HSA_STATUS_ERROR when the sampler cannot be created.
hsa_status_t PALHSALoaderContext::SamplerCreate(
    hsa_agent_t agent, const hsa_ext_sampler_descriptor_t* sampler_descriptor,
    hsa_ext_sampler_t* sampler_handle) {
  if (!agent.handle) {
    return HSA_STATUS_ERROR_INVALID_AGENT;
  }
  if ((sampler_descriptor == nullptr) || (sampler_handle == nullptr)) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  if (program_->isNull()) {
    // Offline compilation. Provide a fake handle to avoid an assert
    sampler_handle->handle = 1;
    return HSA_STATUS_SUCCESS;
  }

  uint32_t samplerState = 0;

  // Coordinate normalization
  switch (sampler_descriptor->coordinate_mode) {
    case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED:
      samplerState = amd::Sampler::StateNormalizedCoordsTrue;
      break;
    case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED:
      samplerState = amd::Sampler::StateNormalizedCoordsFalse;
      break;
    default:
      assert(false);
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Filtering
  switch (sampler_descriptor->filter_mode) {
    case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:
      samplerState |= amd::Sampler::StateFilterLinear;
      break;
    case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST:
      samplerState |= amd::Sampler::StateFilterNearest;
      break;
    default:
      assert(false);
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Addressing
  switch (sampler_descriptor->address_mode) {
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
      samplerState |= amd::Sampler::StateAddressNone;
      break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
      samplerState |= amd::Sampler::StateAddressClampToEdge;
      break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER:
      samplerState |= amd::Sampler::StateAddressClamp;
      break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:
      samplerState |= amd::Sampler::StateAddressRepeat;
      break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
      samplerState |= amd::Sampler::StateAddressMirroredRepeat;
      break;
    default:
      assert(false);
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  pal::Sampler* deviceSampler = new pal::Sampler(program_->dev());
  if ((deviceSampler == nullptr) || !deviceSampler->create(samplerState)) {
    delete deviceSampler;
    return HSA_STATUS_ERROR;
  }

  // The program takes ownership of the sampler.
  program_->addSampler(deviceSampler);
  sampler_handle->handle = deviceSampler->hwSrd();
  return HSA_STATUS_SUCCESS;
}
|
|
|
|
// Loader callback: validates the agent and sampler handles. Nothing is destroyed here.
hsa_status_t PALHSALoaderContext::SamplerDestroy(hsa_agent_t agent,
                                                 hsa_ext_sampler_t sampler_handle) {
  if (agent.handle == 0) {
    return HSA_STATUS_ERROR_INVALID_AGENT;
  }
  return (sampler_handle.handle == 0) ? HSA_STATUS_ERROR_INVALID_ARGUMENT : HSA_STATUS_SUCCESS;
}
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
|
|
// Symbol iteration callback that collects the names of all kernel symbols.
//
// hExec   - executable being iterated (unused)
// hSymbol - symbol currently visited
// data    - pointer to the std::vector<std::string> receiving the kernel names
//
// Returns HSA_STATUS_ERROR when a symbol attribute cannot be queried, HSA_STATUS_SUCCESS
// otherwise (including for symbols that are not kernels).
static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executable_symbol_t hSymbol,
                                           void* data) {
  auto symbol = Symbol::Object(hSymbol);
  auto symbolNameList = reinterpret_cast<std::vector<std::string>*>(data);

  hsa_symbol_kind_t type;
  if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type)) {
    return HSA_STATUS_ERROR;
  }
  if (type != HSA_SYMBOL_KIND_KERNEL) {
    return HSA_STATUS_SUCCESS;
  }

  uint32_t length = 0;
  if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &length)) {
    return HSA_STATUS_ERROR;
  }

  // Heap buffer with room for the terminator; avoids an unbounded stack allocation and the
  // 32-bit wrap of `length + 1`.
  std::vector<char> name(static_cast<size_t>(length) + 1, '\0');
  if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME, name.data())) {
    return HSA_STATUS_ERROR;
  }
  name[length] = '\0';

  symbolNameList->push_back(std::string(name.data()));
  return HSA_STATUS_SUCCESS;
}
|
|
|
|
// Symbol iteration callback that collects the names of all variable symbols.
//
// hExec   - executable being iterated (unused)
// hSymbol - symbol currently visited
// data    - pointer to the std::vector<std::string> receiving the variable names
//
// Returns HSA_STATUS_ERROR when a symbol attribute cannot be queried, HSA_STATUS_SUCCESS
// otherwise (including for symbols that are not variables).
static hsa_status_t GetGlobalVarNamesCallback(hsa_executable_t hExec,
                                              hsa_executable_symbol_t hSymbol, void* data) {
  auto symbol = Symbol::Object(hSymbol);
  auto symbolNameList = reinterpret_cast<std::vector<std::string>*>(data);

  // Executable-symbol attribute, consistent with the other queries made on this symbol.
  hsa_symbol_kind_t type;
  if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type)) {
    return HSA_STATUS_ERROR;
  }
  if (type != HSA_SYMBOL_KIND_VARIABLE) {
    return HSA_STATUS_SUCCESS;
  }

  uint32_t length = 0;
  if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &length)) {
    return HSA_STATUS_ERROR;
  }

  // Heap buffer with room for the terminator; avoids an unbounded stack allocation and the
  // 32-bit wrap of `length + 1`.
  std::vector<char> name(static_cast<size_t>(length) + 1, '\0');
  if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME, name.data())) {
    return HSA_STATUS_ERROR;
  }
  name[length] = '\0';

  symbolNameList->push_back(std::string(name.data()));
  return HSA_STATUS_SUCCESS;
}
|
|
|
|
// Creates the ELF image of the CL binary for the current program type, using the encryption
// setting from the build options. Returns false (and logs an error) on failure.
bool LightningProgram::createBinary(amd::option::Options* options) {
  const bool created = clBinary()->createElfBinary(options->oVariables->BinEncrypt, type());
  if (!created) {
    LogError("Failed to create ELF binary image!");
  }
  return created;
}
|
|
|
|
// Links the LLVM bitcode of several compiled programs into this program (Lightning path).
//
// inputPrograms - programs to link; each is expected to be a LightningProgram
// options       - build options
// createLibrary - when true the linked bitcode is saved as a library and no code is generated
//
// The bitcode of every input (loaded from its CL binary when it is not already in memory) is
// linked into a single module that becomes llvmBinary_. Unless a library was requested, the
// single-program linkImpl(options) then finishes the build.
//
// Returns false and appends a message to the build log on any failure.
bool LightningProgram::linkImpl(const std::vector<Program*>& inputPrograms,
                                amd::option::Options* options, bool createLibrary) {
  using namespace amd::opencl_driver;
  std::unique_ptr<Compiler> C(newCompilerInstance());

  std::vector<Data*> inputs;
  inputs.reserve(inputPrograms.size());
  for (auto* inputProgram : inputPrograms) {
    // Downcast each element individually; reinterpreting the whole vector as a vector of derived
    // pointers is undefined behaviour.
    LightningProgram* program = static_cast<LightningProgram*>(inputProgram);

    if (program->llvmBinary_.empty()) {
      if (program->clBinary() == nullptr) {
        buildLog_ += "Internal error: Input program not compiled!\n";
        return false;
      }

      // We are using CL binary directly.
      // Setup elfIn() and try to load llvmIR from binary
      // This elfIn() will be released at the end of build by finiBuild().
      if (!program->clBinary()->setElfIn()) {
        buildLog_ += "Internal error: Setting input OCL binary failed!\n";
        return false;
      }
      if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_, program->elfSectionType_)) {
        buildLog_ += "Internal error: Failed loading compiled binary!\n";
        return false;
      }
    }

    if (program->elfSectionType_ != amd::OclElf::LLVMIR) {
      buildLog_ += "Error: Input binary format is not supported.\n";
      return false;
    }

    // The buffer only references the input's bitcode; it is not copied.
    Data* input = C->NewBufferReference(DT_LLVM_BC, program->llvmBinary_.data(),
                                        program->llvmBinary_.size());
    if (!input) {
      buildLog_ += "Internal error: Failed to open the compiled programs.\n";
      return false;
    }

    // release elfIn() for the program
    program->clBinary()->resetElfIn();

    inputs.push_back(input);
  }

  // open the linked output
  amd::opencl_driver::Buffer* output = C->NewBuffer(DT_LLVM_BC);
  if (!output) {
    buildLog_ += "Error: Failed to open the linked program.\n";
    return false;
  }

  std::vector<std::string> linkOptions;

  // NOTE: The params is also used to identify cached code object. This parameter
  //       should not contain any dynamically generated filename.
  bool ret =
      dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, output, linkOptions, buildLog_);
  buildLog_ += C->Output();
  if (!ret) {
    buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n";
    return false;
  }

  llvmBinary_.assign(output->Buf().data(), output->Size());
  elfSectionType_ = amd::OclElf::LLVMIR;

  if (clBinary()->saveLLVMIR()) {
    clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(),
                                     false);
    // store the original link options
    clBinary()->storeLinkOptions(linkOptions_);
    // store the original compile options
    clBinary()->storeCompileOptions(compileOptions_);
  }

  // skip the rest if we are building an opencl library
  if (createLibrary) {
    setType(TYPE_LIBRARY);
    if (!createBinary(options)) {
      buildLog_ += "Internal error: creating OpenCL binary failed\n";
      return false;
    }
    return true;
  }

  return linkImpl(options);
}
|
|
|
|
//! Links the program's LLVM bitcode with the device bitcode libraries and the OCLC control
//! functions, compiles the linked module into an executable code object and passes that
//! code object to setKernels().
//!
//! When llvmBinary_ is empty the stage to resume from is taken from
//! getNextCompilationStageFromBinary(): an ISA-level binary is handed to setKernels()
//! directly, and anything other than LLVM IR is rejected.
//!
//! @param options  Build options; supplies the dump flags, the optimisation level, the
//!                 clang/llvm option strings and the math-mode variables read below.
//! @return true on success; false after a diagnostic has been appended to buildLog_.
bool LightningProgram::linkImpl(amd::option::Options* options) {
  using namespace amd::opencl_driver;
  internal_ = compileOptions_.find("-cl-internal-kernel") != std::string::npos;

  const aclType continueCompileFrom =
      llvmBinary_.empty() ? getNextCompilationStageFromBinary(options) : ACL_TYPE_LLVMIR_BINARY;

  if (continueCompileFrom == ACL_TYPE_ISA) {
    // The binary already holds a code object, so no linking or code generation is needed.
    binary_t isa = binary();
    if ((isa.first != nullptr) && (isa.second > 0)) {
      return setKernels(options, (void*)isa.first, isa.second);
    }
    buildLog_ += "Error: code object is empty \n";
    return false;
  }
  if (continueCompileFrom != ACL_TYPE_LLVMIR_BINARY) {
    buildLog_ += "Error while Codegen phase: the binary is incomplete \n";
    return false;
  }

  std::unique_ptr<Compiler> C(newCompilerInstance());
  // call LinkLLVMBitcode
  std::vector<Data*> inputs;

  // open the input IR source
  Data* input = C->NewBufferReference(DT_LLVM_BC, llvmBinary_.data(), llvmBinary_.size());

  if (!input) {
    buildLog_ += "Error: Failed to open the compiled program.\n";
    return false;
  }

  inputs.push_back(input);  //< must be the first input

  // open the bitcode libraries
  Data* opencl_bc =
      C->NewBufferReference(DT_LLVM_BC, (const char*)opencl_amdgcn, opencl_amdgcn_size);
  Data* ocml_bc = C->NewBufferReference(DT_LLVM_BC, (const char*)ocml_amdgcn, ocml_amdgcn_size);
  Data* ockl_bc = C->NewBufferReference(DT_LLVM_BC, (const char*)ockl_amdgcn, ockl_amdgcn_size);

  if (!opencl_bc || !ocml_bc || !ockl_bc) {
    buildLog_ += "Error: Failed to open the bitcode library.\n";
    return false;
  }

  inputs.push_back(opencl_bc);  // depends on ocml & ockl
  inputs.push_back(ockl_bc);
  inputs.push_back(ocml_bc);

  // open the control functions
  auto isa_version = get_oclc_isa_version(dev().hwInfo()->gfxipVersion_);
  if (!isa_version.first) {
    buildLog_ += "Error: Linking for this device is not supported\n";
    return false;
  }

  Data* isa_version_bc =
      C->NewBufferReference(DT_LLVM_BC, (const char*)isa_version.first, isa_version.second);

  if (!isa_version_bc) {
    buildLog_ += "Error: Failed to open the control functions.\n";
    return false;
  }

  inputs.push_back(isa_version_bc);

  auto correctly_rounded_sqrt =
      get_oclc_correctly_rounded_sqrt(options->oVariables->FP32RoundDivideSqrt);
  Data* correctly_rounded_sqrt_bc = C->NewBufferReference(DT_LLVM_BC, correctly_rounded_sqrt.first,
                                                          correctly_rounded_sqrt.second);

  // Denormals-are-zero is selected when the option asks for it, when
  // AMD_GPU_FORCE_SINGLE_FP_DENORM is 0, or on devices below gfx900 when that flag is negative.
  auto daz_opt = get_oclc_daz_opt(options->oVariables->DenormsAreZero ||
                                  AMD_GPU_FORCE_SINGLE_FP_DENORM == 0 ||
                                  (dev().hwInfo()->gfxipVersion_ < 900 &&
                                   AMD_GPU_FORCE_SINGLE_FP_DENORM < 0));
  Data* daz_opt_bc = C->NewBufferReference(DT_LLVM_BC, daz_opt.first, daz_opt.second);

  auto finite_only = get_oclc_finite_only(options->oVariables->FiniteMathOnly ||
                                          options->oVariables->FastRelaxedMath);
  Data* finite_only_bc = C->NewBufferReference(DT_LLVM_BC, finite_only.first, finite_only.second);

  auto unsafe_math = get_oclc_unsafe_math(options->oVariables->UnsafeMathOpt ||
                                          options->oVariables->FastRelaxedMath);
  Data* unsafe_math_bc = C->NewBufferReference(DT_LLVM_BC, unsafe_math.first, unsafe_math.second);

  if (!correctly_rounded_sqrt_bc || !daz_opt_bc || !finite_only_bc || !unsafe_math_bc) {
    buildLog_ += "Error: Failed to open the control functions.\n";
    return false;
  }

  inputs.push_back(correctly_rounded_sqrt_bc);
  inputs.push_back(daz_opt_bc);
  inputs.push_back(finite_only_bc);
  inputs.push_back(unsafe_math_bc);

  // open the linked output
  std::vector<std::string> linkOptions;
  amd::opencl_driver::Buffer* linked_bc = C->NewBuffer(DT_LLVM_BC);

  if (!linked_bc) {
    buildLog_ += "Error: Failed to open the linked program.\n";
    return false;
  }

  // NOTE: The linkOptions parameter is also used to identify cached code object. This parameter
  // should not contain any dynamically generated filename.
  bool ret =
      dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, linked_bc, linkOptions, buildLog_);
  buildLog_ += C->Output();
  if (!ret) {
    buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n";
    return false;
  }

  if (options->isDumpFlagSet(amd::option::DUMP_BC_LINKED)) {
    std::ofstream f(options->getDumpFileName("_linked.bc").c_str(),
                    std::ios::binary | std::ios::trunc);
    if (f.is_open()) {
      f.write(linked_bc->Buf().data(), linked_bc->Size());
      f.close();
    } else {
      buildLog_ += "Warning: opening the file to dump the linked IR failed.\n";
    }
  }

  // From here on the only compiler input is the fully linked module.
  inputs.clear();
  inputs.push_back(linked_bc);

  amd::opencl_driver::Buffer* out_exec = C->NewBuffer(DT_EXECUTABLE);
  if (!out_exec) {
    buildLog_ += "Error: Failed to create the linked executable.\n";
    return false;
  }

  std::string codegenOptions(options->llvmOptions);

  // Set the machine target
  std::ostringstream mCPU;
  mCPU << " -mcpu=gfx" << dev().hwInfo()->gfxipVersion_;
  codegenOptions.append(mCPU.str());

  // Set xnack option if needed
  if (dev().hwInfo()->xnackEnabled_) {
    codegenOptions.append(" -mxnack");
  }

  // Set the -O#
  std::ostringstream optLevel;
  optLevel << "-O" << options->oVariables->OptLevel;
  codegenOptions.append(" ").append(optLevel.str());

  // Pass clang options
  std::ostringstream ostrstr;
  std::copy(options->clangOptions.begin(), options->clangOptions.end(),
            std::ostream_iterator<std::string>(ostrstr, " "));
  codegenOptions.append(" ").append(ostrstr.str());

  // Set whole program mode
  codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");

  // Tokenize the options string into a vector of strings
  std::istringstream strstr(codegenOptions);
  std::istream_iterator<std::string> sit(strstr), end;
  std::vector<std::string> params(sit, end);

  // NOTE: The params is also used to identify cached code object. This parameter
  // should not contain any dynamically generated filename.
  ret = dev().cacheCompilation()->compileAndLinkExecutable(C.get(), inputs, out_exec, params,
                                                           buildLog_);
  buildLog_ += C->Output();
  if (!ret) {
    buildLog_ += "Error: Creating the executable failed: Compiling LLVM IRs to exeutable\n";
    return false;
  }

  if (options->isDumpFlagSet(amd::option::DUMP_O)) {
    std::ofstream f(options->getDumpFileName(".so").c_str(), std::ios::binary | std::ios::trunc);
    if (f.is_open()) {
      f.write(out_exec->Buf().data(), out_exec->Size());
      f.close();
    } else {
      buildLog_ += "Warning: opening the file to dump the code object failed.\n";
    }
  }

  if (options->isDumpFlagSet(amd::option::DUMP_ISA)) {
    std::string name = options->getDumpFileName(".s");
    File* dump = C->NewFile(DT_INTERNAL, name);
    // Dumping is best effort: a file object that could not be created is reported the same
    // way as a failed dump instead of being handed to the compiler as a null pointer.
    if ((dump == nullptr) || !C->DumpExecutableAsText(out_exec, dump)) {
      buildLog_ += "Warning: failed to dump code object.\n";
    }
  }

  return setKernels(options, out_exec->Buf().data(), out_exec->Size());
}
|
|
|
|
//! Loads an AMD HSA code object into a loader executable, reads the runtime metadata note and
//! the program-variable segment sizes from its ELF image, and creates one LightningKernel per
//! kernel symbol found in the executable.
//!
//! @param options  Build options; the original option string and UniformWorkGroupSize are
//!                 forwarded to every created kernel.
//! @param binary   Start of the code object image in host memory.
//! @param size     Size of the image in bytes.
//! @return true when all kernels were initialised and the binary was saved; false after a
//!         diagnostic has been appended to buildLog_.
bool LightningProgram::setKernels(amd::option::Options* options, void* binary, size_t size) {
  hsa_agent_t agent;
  // NOTE(review): fixed handle value; how the loader interprets it is not visible here.
  agent.handle = 1;

  executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr);
  if (executable_ == nullptr) {
    buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
    return false;
  }

  // The loader takes the in-memory image address as the code object handle.
  hsa_code_object_t code_object;
  code_object.handle = reinterpret_cast<uint64_t>(binary);

  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
  if (status != HSA_STATUS_SUCCESS) {
    buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
    return false;
  }

  status = executable_->Freeze(nullptr);
  if (status != HSA_STATUS_SUCCESS) {
    buildLog_ += "Error: Freezing the executable failed.\n";
    return false;
  }

  size_t progvarsTotalSize = 0;
  size_t dynamicSize = 0;

  {
    // Releases the ELF descriptor on every way out of this scope, including the error returns.
    struct ElfCloser {
      Elf* elf_;
      ~ElfCloser() {
        if (elf_ != nullptr) {
          elf_end(elf_);
        }
      }
    };

    // Begin the Elf image from memory
    Elf* e = elf_memory((char*)binary, size, NULL);
    ElfCloser closer = {e};

    if (elf_kind(e) != ELF_K_ELF) {
      buildLog_ += "Error while reading the ELF program binary\n";
      return false;
    }

    size_t numpHdrs;
    if (elf_getphdrnum(e, &numpHdrs) != 0) {
      buildLog_ += "Error while reading the ELF program binary\n";
      return false;
    }

    for (size_t i = 0; i < numpHdrs; ++i) {
      GElf_Phdr pHdr;
      if (gelf_getphdr(e, i, &pHdr) != &pHdr) {
        continue;
      }
      // Look for the runtime metadata note
      if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
        // A segment that claims bytes outside the image cannot be walked safely.
        if (pHdr.p_offset > size || pHdr.p_filesz > size - pHdr.p_offset) {
          continue;
        }

        // Iterate over the notes in this segment
        address ptr = (address)binary + pHdr.p_offset;
        address segmentEnd = ptr + pHdr.p_filesz;

        while (ptr < segmentEnd) {
          const size_t remaining = static_cast<size_t>(segmentEnd - ptr);
          if (remaining < sizeof(Elf_Note)) {
            break;  // truncated note header
          }
          Elf_Note* note = (Elf_Note*)ptr;

          //! @todo: Use constants and enums defined in AMDGPUPTNote.h.
          //! In order to switch to using constants and enums defined in
          //! AMDGPUPTNote.h, we need to clean up internal header files.
          if (note->n_type == 7 || note->n_type == 8) {
            buildLog_ +=
                "Error: object code with old metadata is not "
                "supported\n";
            return false;
          }

          // Name and descriptor are each padded to an int boundary after the header.
          const size_t nameSize = amd::alignUp(note->n_namesz, sizeof(int));
          const size_t descSize = amd::alignUp(note->n_descsz, sizeof(int));
          const size_t payloadRoom = remaining - sizeof(Elf_Note);
          if (nameSize > payloadRoom || note->n_descsz > payloadRoom - nameSize) {
            break;  // name or descriptor would run past the end of the segment
          }

          address name = (address)&note[1];
          address desc = name + nameSize;

          if (note->n_type == 10 /*AMDGPU::ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA*/
              && note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) {
            std::string metadataStr((const char*)desc, (size_t)note->n_descsz);
            metadata_ = new CodeObjectMD();
            if (llvm::AMDGPU::HSAMD::fromString(metadataStr, *metadata_)) {
              buildLog_ += "Error: failed to process metadata\n";
              return false;
            }
            // We've found and loaded the runtime metadata, exit the
            // note record loop now.
            break;
          }
          ptr += sizeof(*note) + nameSize + descSize;
        }
      }
      // Accumulate the size of R & !X loadable segments
      else if (pHdr.p_type == PT_LOAD && !(pHdr.p_flags & PF_X)) {
        if (pHdr.p_flags & PF_R) {
          progvarsTotalSize += pHdr.p_memsz;
        }
      } else if (pHdr.p_type == PT_DYNAMIC) {
        dynamicSize += pHdr.p_memsz;
      }
    }
  }

  if (!metadata_) {
    buildLog_ +=
        "Error: runtime metadata section not present in "
        "ELF program binary\n";
    return false;
  }

  // The dynamic segment is excluded from the size reported for global variables.
  progvarsTotalSize -= dynamicSize;
  setGlobalVariableTotalSize(progvarsTotalSize);

  // Get the list of kernels
  std::vector<std::string> kernelNameList;
  status = executable_->IterateSymbols(GetKernelNamesCallback, &kernelNameList);
  if (status != HSA_STATUS_SUCCESS) {
    buildLog_ += "Error: Failed to get kernel names\n";
    return false;
  }

  for (const auto& kernelName : kernelNameList) {
    auto kernel =
        new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptions(options));

    // Registered before initialisation, so the kernel map holds it even on the failure paths.
    kernels()[kernelName] = kernel;

    auto symbol = executable_->GetSymbol(kernelName.c_str(), &agent);
    if (!symbol) {
      buildLog_ += "Error: Getting kernel symbol '" + kernelName +
          "' from AMD HSA Code Object failed. "
          "Kernel initialization failed.\n";
      return false;
    }
    if (!kernel->init(symbol)) {
      buildLog_ += "Error: Kernel '" + kernelName + "' initialization failed.\n";
      return false;
    }
    buildLog_ += kernel->buildLog();

    kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);

    // Find max scratch regs used in the program. It's used for scratch buffer preallocation
    // with dynamic parallelism, since runtime doesn't know which child kernel will be called
    maxScratchRegs_ =
        std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
  }

  // Get the list of global variables
  std::vector<std::string> glbVarNames;
  status = executable_->IterateSymbols(GetGlobalVarNamesCallback, &glbVarNames);
  if (status != HSA_STATUS_SUCCESS) {
    buildLog_ += "Error: Failed to get global variable names\n";
    return false;
  }
  hasGlobalStores_ = !glbVarNames.empty();

  DestroySegmentCpuAccess();

  // Save the binary and type
  clBinary()->saveBIFBinary((char*)binary, size);
  setType(TYPE_EXECUTABLE);

  return true;
}
|
|
|
|
//! Frees the code object metadata held in metadata_ (allocated by setKernels()).
LightningProgram::~LightningProgram() {
  // Deleting a null pointer is a no-op, so no check is needed.
  delete metadata_;
}
|
|
|
|
#endif // defined(WITH_LIGHTNING_COMPILER)
|
|
|
|
} // namespace pal
|