SWDEV-277566 - Separate code object loading from building
Change-Id: I87b8178f55e8ef23762dfe11fab71665ba680f00
Этот коммит содержится в:
коммит произвёл
Jason Tang
родитель
509f528980
Коммит
211ba25b4e
@@ -366,6 +366,10 @@ bool Device::BlitProgram::create(amd::Device* device, const char* extraKernels,
|
||||
DevLogPrintfError("Build failed for Kernel: %s \n", kernels.c_str());
|
||||
return false;
|
||||
}
|
||||
if (!program_->load()) {
|
||||
DevLogPrintfError("Could not load the kernels: %s \n", kernels.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -91,10 +91,6 @@ class Device;
|
||||
struct KernelParameterDescriptor;
|
||||
struct Coord3D;
|
||||
|
||||
namespace option {
|
||||
class Options;
|
||||
} // namespace option
|
||||
|
||||
//! @note: the defines match hip values
|
||||
enum MemoryAdvice : uint32_t {
|
||||
SetReadMostly = 1, ///< Data will mostly be read and only occasionally be written to
|
||||
|
||||
@@ -1150,13 +1150,16 @@ bool Program::linkImplLC(amd::option::Options* options) {
|
||||
case FILE_TYPE_ISA: {
|
||||
amd::Comgr::destroy_data_set(inputs);
|
||||
binary_t isaBinary = binary();
|
||||
finfo_t isaFdesc = BinaryFd();
|
||||
if (GPU_DUMP_CODE_OBJECT) {
|
||||
dumpCodeObject(std::string{(const char*)isaBinary.first, isaBinary.second});
|
||||
}
|
||||
return setKernels(options, const_cast<void *>(isaBinary.first), isaBinary.second,
|
||||
isaFdesc.first, isaFdesc.second, BinaryURI());
|
||||
break;
|
||||
|
||||
if (!createKernels(const_cast<void *>(isaBinary.first), isaBinary.second,
|
||||
options->oVariables->UniformWorkGroupSize, internal_)) {
|
||||
buildLog_ += "Error: Cannot create kernels.\n";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
buildLog_ += "Error while Codegen phase: the binary is incomplete \n";
|
||||
@@ -1284,8 +1287,9 @@ bool Program::linkImplLC(amd::option::Options* options) {
|
||||
// Destroy original memory with executable after compilation
|
||||
delete[] executable;
|
||||
|
||||
if (!setKernels(options, const_cast<void*>(clBinary()->data().first),
|
||||
clBinary()->data().second)) {
|
||||
if (!createKernels(const_cast<void*>(clBinary()->data().first), clBinary()->data().second,
|
||||
options->oVariables->UniformWorkGroupSize, internal_)) {
|
||||
buildLog_ += "Error: Cannot create kernels.\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1352,17 +1356,17 @@ bool Program::linkImplHSAIL(amd::option::Options* options) {
|
||||
fin_options.append(" -sc-xnack-iommu");
|
||||
}
|
||||
|
||||
if (device().settings().enableWave32Mode_) {
|
||||
fin_options.append(" -force-wave-size-32");
|
||||
}
|
||||
if (device().settings().enableWave32Mode_) {
|
||||
fin_options.append(" -force-wave-size-32");
|
||||
}
|
||||
|
||||
if (device().settings().enableWgpMode_) {
|
||||
fin_options.append(" -force-wgp-mode");
|
||||
}
|
||||
if (device().settings().enableWgpMode_) {
|
||||
fin_options.append(" -force-wgp-mode");
|
||||
}
|
||||
|
||||
if (device().settings().hsailExplicitXnack_) {
|
||||
fin_options.append(" -xnack");
|
||||
}
|
||||
if (device().settings().hsailExplicitXnack_) {
|
||||
fin_options.append(" -xnack");
|
||||
}
|
||||
|
||||
errorCode = amd::Hsail::Compile(device().compiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG,
|
||||
ACL_TYPE_ISA, logFunction);
|
||||
@@ -1382,8 +1386,8 @@ bool Program::linkImplHSAIL(amd::option::Options* options) {
|
||||
}
|
||||
|
||||
// Call the device layer to setup all available kernels on the actual device
|
||||
if (!setKernels(options, binary, binSize)) {
|
||||
buildLog_ += "Error: Cannot set kernel \n";
|
||||
if (!createKernels(binary, binSize, options->oVariables->UniformWorkGroupSize, internal_)) {
|
||||
buildLog_ += "Error: Cannot create kernel.\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1772,6 +1776,48 @@ int32_t Program::build(const std::string& sourceCode, const char* origOptions,
|
||||
return buildError();
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Load the program on the HSAIL path: pull the aclTEXT section out of binaryElf_
//! and hand it to setKernels(). Returns false when the section cannot be extracted,
//! when setKernels() fails, or when the compiler library is not compiled in.
bool Program::loadHSAIL() {
#if !defined(WITH_COMPILER_LIB)
  return false;
#else
  acl_error extractStatus;
  size_t isaSize;
  const void* isaSection = amd::Hsail::ExtractSection(device().compiler(), binaryElf_, &isaSize,
                                                      aclTEXT, &extractStatus);
  if (extractStatus != ACL_SUCCESS) {
    LogPrintfError("Error: cannot extract ISA from compiled binary.\n");
    return false;
  }

  // Let the device layer set up every available kernel on the actual device
  return setKernels(const_cast<void*>(isaSection), isaSize);
#endif
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Load the program on the LC path: forward the program binary, its file
//! descriptor/offset pair and its URI to setKernels(). Returns false when the
//! COMGR library is not compiled in.
bool Program::loadLC() {
#if !defined(USE_COMGR_LIBRARY)
  return false;
#else
  const binary_t image = binary();
  const finfo_t imageFile = BinaryFd();
  return setKernels(const_cast<void*>(image.first), image.second, imageFile.first,
                    imageFile.second, BinaryURI());
#endif
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Program::load() {
|
||||
bool ret;
|
||||
if (isLC()) {
|
||||
ret = loadLC();
|
||||
} else {
|
||||
ret = loadHSAIL();
|
||||
}
|
||||
if (ret) {
|
||||
coLoaded_ = 1;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
std::vector<std::string> Program::ProcessOptions(amd::option::Options* options) {
|
||||
std::vector<std::string> optionsVec;
|
||||
|
||||
@@ -123,7 +123,8 @@ class Program : public amd::HeapObject {
|
||||
uint32_t internal_ : 1; //!< Internal blit program
|
||||
uint32_t isLC_ : 1; //!< LC was used for the program compilation
|
||||
uint32_t hasGlobalStores_ : 1; //!< Program has writable program scope variables
|
||||
uint32_t isHIP_ : 1; //!< Determine if the program is for HIP
|
||||
uint32_t isHIP_ : 1; //!< Determine if the program is for HIP
|
||||
uint32_t coLoaded_ : 1; //!< Has the code object been loaded
|
||||
};
|
||||
uint32_t flags_; //!< Program flags
|
||||
};
|
||||
@@ -178,15 +179,18 @@ class Program : public amd::HeapObject {
|
||||
const char** headerIncludeNames, const char* origOptions,
|
||||
amd::option::Options* options);
|
||||
|
||||
//! Builds the device program.
|
||||
//! Link the device program.
|
||||
int32_t link(const std::vector<Program*>& inputPrograms, const char* origOptions,
|
||||
amd::option::Options* options);
|
||||
|
||||
//! Builds the device program.
|
||||
//! Build the device program.
|
||||
int32_t build(const std::string& sourceCode, const char* origOptions,
|
||||
amd::option::Options* options, const std::vector<std::string>& preCompiledHeaders);
|
||||
|
||||
//! Returns the device object, associated with this program.
|
||||
//! Load the device program.
|
||||
bool load();
|
||||
|
||||
//! Return the device object, associated with this program.
|
||||
const amd::Device& device() const { return device_(); }
|
||||
|
||||
//! Return the compiler options used to build the program.
|
||||
@@ -248,6 +252,9 @@ class Program : public amd::HeapObject {
|
||||
//! Global variables are a part of the code segment
|
||||
bool hasGlobalStores() const { return hasGlobalStores_; }
|
||||
|
||||
//! Return TRUE if the program has been loaded
|
||||
bool isCodeObjectLoaded() const { return coLoaded_; }
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
amd_comgr_metadata_node_t metadata() const { return metadata_; }
|
||||
|
||||
@@ -324,9 +331,11 @@ class Program : public amd::HeapObject {
|
||||
//! return target info
|
||||
virtual const aclTargetInfo& info() = 0;
|
||||
#endif
|
||||
virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) { return true; }
|
||||
|
||||
virtual bool setKernels(
|
||||
amd::option::Options* options, void* binary, size_t binSize,
|
||||
void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0,
|
||||
std::string uri = std::string()) { return true; }
|
||||
|
||||
@@ -397,6 +406,12 @@ class Program : public amd::HeapObject {
|
||||
//! Link the device program with HSAIL path
|
||||
bool linkImplHSAIL(amd::option::Options* options);
|
||||
|
||||
//! Load the device program with LC path
|
||||
bool loadLC();
|
||||
|
||||
//! Load the device program with HSAIL path
|
||||
bool loadHSAIL();
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
//! Dump the log data object to the build log, if a log data object is present
|
||||
void extractBuildLog(amd_comgr_data_set_t dataSet);
|
||||
|
||||
@@ -85,15 +85,26 @@ bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
bool HSAILKernel::aqlCreateHWInfo() {
|
||||
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
||||
std::string openclKernelName = device::Kernel::openclMangledName(name());
|
||||
amd::hsa::loader::Symbol* sym = prog().getSymbol(openclKernelName.c_str(), &agent);
|
||||
if (!sym) {
|
||||
LogPrintfError("Error: Getting kernel ISA code symbol %s from AMD HSA Code Object failed.\n",
|
||||
openclKernelName.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
amd_kernel_code_t* akc = &akc_;
|
||||
|
||||
if (!setKernelCode(sym, akc)) {
|
||||
LogPrintfError("Error: setKernelCode() failed.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
|
||||
reinterpret_cast<void*>(&codeSize_))) {
|
||||
LogPrintfError("Error: sym->GetInfo() failed.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -107,45 +118,30 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
return true;
|
||||
}
|
||||
|
||||
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
|
||||
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, bool internalKernel)
|
||||
: device::Kernel(prog->device(), name, *prog),
|
||||
compileOptions_(compileOptions),
|
||||
index_(0),
|
||||
code_(0),
|
||||
codeSize_(0) {
|
||||
flags_.hsa_ = true;
|
||||
flags_.internalKernel_ = internalKernel;
|
||||
}
|
||||
|
||||
HSAILKernel::~HSAILKernel() {}
|
||||
|
||||
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
|
||||
//! Per-kernel setup step run after the code object has been loaded.
//! With the compiler library available, the result is that of aqlCreateHWInfo();
//! without it there is nothing to do and the call succeeds.
bool HSAILKernel::postLoad() {
  bool ready = true;
#if defined(WITH_COMPILER_LIB)
  ready = aqlCreateHWInfo();
#endif
  return ready;
}
|
||||
|
||||
bool HSAILKernel::init() {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
acl_error error = ACL_SUCCESS;
|
||||
std::string openClKernelName = openclMangledName(name());
|
||||
flags_.internalKernel_ =
|
||||
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
||||
// compile kernel down to ISA
|
||||
if (finalize) {
|
||||
std::string options(compileOptions_.c_str());
|
||||
options.append(" -just-kernel=");
|
||||
options.append(openClKernelName.c_str());
|
||||
// Append an option so that we can selectively enable a SCOption on CZ
|
||||
// whenever IOMMUv2 is enabled.
|
||||
if (palNullDevice().settings().svmFineGrainSystem_) {
|
||||
options.append(" -sc-xnack-iommu");
|
||||
}
|
||||
error = amd::Hsail::Compile(palNullDevice().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
|
||||
ACL_TYPE_ISA, nullptr);
|
||||
buildLog_ += amd::Hsail::GetCompilerLog(palNullDevice().compiler());
|
||||
if (error != ACL_SUCCESS) {
|
||||
LogError("Failed to finalize kernel");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!aqlCreateHWInfo(sym)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Pull out metadata from the ELF
|
||||
size_t sizeOfArgList;
|
||||
@@ -437,13 +433,10 @@ const LightningProgram& LightningKernel::prog() const {
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
bool LightningKernel::init() {
|
||||
flags_.internalKernel_ =
|
||||
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
||||
|
||||
if (!GetAttrCodePropMetadata()) {
|
||||
return false;
|
||||
}
|
||||
return GetAttrCodePropMetadata();
|
||||
}
|
||||
|
||||
bool LightningKernel::postLoad() {
|
||||
if (codeObjectVer() == 2) {
|
||||
symbolName_ = name();
|
||||
}
|
||||
@@ -451,7 +444,7 @@ bool LightningKernel::init() {
|
||||
// Copy codeobject of this kernel from the program CPU segment
|
||||
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
||||
|
||||
auto sym = prog().GetSymbol(symbolName().c_str(), &agent);
|
||||
auto sym = prog().getSymbol(symbolName().c_str(), &agent);
|
||||
|
||||
if (!setKernelCode(sym, &akc_)) {
|
||||
return false;
|
||||
@@ -465,7 +458,7 @@ bool LightningKernel::init() {
|
||||
amd::hsa::loader::Symbol* rth_symbol;
|
||||
|
||||
// Get the runtime handle symbol GPU address
|
||||
rth_symbol = prog().GetSymbol(RuntimeHandle().c_str(), &agent);
|
||||
rth_symbol = prog().getSymbol(RuntimeHandle().c_str(), &agent);
|
||||
uint64_t symbol_address;
|
||||
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
|
||||
|
||||
|
||||
@@ -60,13 +60,15 @@ class LightningProgram;
|
||||
*/
|
||||
class HSAILKernel : public device::Kernel {
|
||||
public:
|
||||
HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions);
|
||||
HSAILKernel(std::string name, HSAILProgram* prog, bool internalKernel);
|
||||
|
||||
virtual ~HSAILKernel();
|
||||
|
||||
//! Initializes the metadata required for this kernel,
|
||||
//! finalizes the kernel if needed
|
||||
bool init(amd::hsa::loader::Symbol* sym, bool finalize = false);
|
||||
bool init();
|
||||
|
||||
//! Setup after code object loading
|
||||
bool postLoad();
|
||||
|
||||
//! Returns PAL, possibly null, device object, associated with this kernel.
|
||||
const NullDevice& palNullDevice() const { return reinterpret_cast<const NullDevice&>(dev_); }
|
||||
@@ -122,7 +124,7 @@ class HSAILKernel : public device::Kernel {
|
||||
|
||||
protected:
|
||||
//! Creates AQL kernel HW info
|
||||
bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);
|
||||
bool aqlCreateHWInfo();
|
||||
|
||||
//! Get the kernel code and copy the code object from the program CPU segment
|
||||
bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
|
||||
@@ -131,7 +133,6 @@ class HSAILKernel : public device::Kernel {
|
||||
void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize,
|
||||
const uint16_t numSGPRs, const uint16_t numVGPRs);
|
||||
|
||||
std::string compileOptions_; //!< compile used for finalizing this kernel
|
||||
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
|
||||
uint index_; //!< Kernel index in the program
|
||||
|
||||
@@ -141,18 +142,18 @@ class HSAILKernel : public device::Kernel {
|
||||
|
||||
class LightningKernel : public HSAILKernel {
|
||||
public:
|
||||
LightningKernel(const std::string& name, HSAILProgram* prog, const std::string& compileOptions)
|
||||
: HSAILKernel(name, prog, compileOptions) {}
|
||||
LightningKernel(const std::string& name, HSAILProgram* prog, bool internalKernel)
|
||||
: HSAILKernel(name, prog, internalKernel) {}
|
||||
|
||||
//! Returns Lightning program associated with this kernel
|
||||
const LightningProgram& prog() const;
|
||||
|
||||
//! Initializes the metadata required for this kernel,
|
||||
bool init(amd::hsa::loader::Symbol* symbol);
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
//! Initializes the metadata required for this kernel,
|
||||
//! Initializes the metadata required for this kernel
|
||||
bool init();
|
||||
|
||||
//! Setup after code object loading
|
||||
bool postLoad();
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@@ -243,7 +243,44 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
|
||||
return vec;
|
||||
}
|
||||
|
||||
bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) {
|
||||
size_t kernelNamesSize = 0;
|
||||
acl_error errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_,
|
||||
RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
|
||||
return false;
|
||||
}
|
||||
if (kernelNamesSize > 0) {
|
||||
char* kernelNames = new char[kernelNamesSize];
|
||||
errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES,
|
||||
nullptr, kernelNames, &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
|
||||
delete[] kernelNames;
|
||||
return false;
|
||||
}
|
||||
std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
|
||||
delete[] kernelNames;
|
||||
for (const auto& it : vKernels) {
|
||||
std::string kernelName(it);
|
||||
|
||||
HSAILKernel* aKernel = new HSAILKernel(kernelName, this, internalKernel);
|
||||
kernels()[kernelName] = aKernel;
|
||||
|
||||
if (!aKernel->init()) {
|
||||
buildLog_ += "Error: Kernel initialization failed.\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
aKernel->setUniformWorkGroupSize(useUniformWorkGroupSize);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HSAILProgram::setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc, size_t foffset, std::string uri) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
// Stop compilation if it is an offline device - PAL runtime does not
|
||||
@@ -275,56 +312,23 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t kernelNamesSize = 0;
|
||||
acl_error errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
|
||||
nullptr, &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
|
||||
return false;
|
||||
bool dynamicParallelism = false;
|
||||
for (auto& kit : kernels()) {
|
||||
HSAILKernel* aKernel = static_cast<HSAILKernel*>(kit.second);
|
||||
if (!aKernel->postLoad()) {
|
||||
return false;
|
||||
}
|
||||
dynamicParallelism |= aKernel->dynamicParallelism();
|
||||
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
|
||||
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
|
||||
maxScratchRegs_ =
|
||||
std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
||||
maxVgprs_ = std::max(static_cast<uint>(aKernel->workGroupInfo()->usedVGPRs_), maxVgprs_);
|
||||
}
|
||||
if (kernelNamesSize > 0) {
|
||||
char* kernelNames = new char[kernelNamesSize];
|
||||
errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames,
|
||||
&kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
|
||||
delete[] kernelNames;
|
||||
return false;
|
||||
}
|
||||
std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
|
||||
delete[] kernelNames;
|
||||
bool dynamicParallelism = false;
|
||||
for (const auto& it : vKernels) {
|
||||
std::string kernelName(it);
|
||||
std::string openclKernelName = device::Kernel::openclMangledName(kernelName);
|
||||
|
||||
HSAILKernel* aKernel =
|
||||
new HSAILKernel(kernelName, this, options->origOptionStr + ProcessOptionsFlattened(options));
|
||||
kernels()[kernelName] = aKernel;
|
||||
|
||||
amd::hsa::loader::Symbol* sym = executable_->GetSymbol(openclKernelName.c_str(), &agent);
|
||||
if (!sym) {
|
||||
buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName +
|
||||
"' from AMD HSA Code Object failed. Kernel initialization failed.\n";
|
||||
return false;
|
||||
}
|
||||
if (!aKernel->init(sym, false)) {
|
||||
buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n";
|
||||
return false;
|
||||
}
|
||||
buildLog_ += aKernel->buildLog();
|
||||
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
dynamicParallelism |= aKernel->dynamicParallelism();
|
||||
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
|
||||
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
|
||||
maxScratchRegs_ =
|
||||
std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
||||
maxVgprs_ = std::max(static_cast<uint>(aKernel->workGroupInfo()->usedVGPRs_), maxVgprs_);
|
||||
}
|
||||
// Allocate kernel table for device enqueuing
|
||||
if (!isNull() && dynamicParallelism && !allocKernelTable()) {
|
||||
return false;
|
||||
}
|
||||
// Allocate kernel table for device enqueuing
|
||||
if (!isNull() && dynamicParallelism && !allocKernelTable()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
DestroySegmentCpuAccess();
|
||||
@@ -731,7 +735,34 @@ bool LightningProgram::createBinary(amd::option::Options* options) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LightningProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
//! Create and initialize one LightningKernel per entry of kernelMetadataMap_.
//!
//! @param binary                   Code object image, used to find global variable sizes
//! @param binSize                  Size of the image in bytes
//! @param useUniformWorkGroupSize  Forwarded to every created kernel
//! @param internalKernel           Passed to every kernel's constructor
//! @return false if the global variable sizes cannot be found or a kernel fails
//!         to initialize (the reason is appended to buildLog_), true otherwise.
//!         Without USE_COMGR_LIBRARY nothing is created and true is returned.
bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
                                     bool internalKernel) {
#if defined(USE_COMGR_LIBRARY)
  // Find the size of global variables from the binary
  if (!FindGlobalVarSize(binary, binSize)) {
    buildLog_ += "Error: Cannot Find Global Var Sizes\n";
    return false;
  }

  for (const auto& kernelMeta : kernelMetadataMap_) {
    const auto& kernelName = kernelMeta.first;
    auto kernel = new LightningKernel(kernelName, this, internalKernel);
    if (kernel == nullptr) {
      return false;
    }
    if (!kernel->init()) {
      buildLog_ += "[ROC][Kernel] Could not get Code Prop Meta Data \n";
      // The kernel is not in kernels() yet, so nothing else would release it
      delete kernel;
      return false;
    }
    kernels()[kernelName] = kernel;

    kernel->setUniformWorkGroupSize(useUniformWorkGroupSize);
  }
#endif
  return true;
}
|
||||
|
||||
bool LightningProgram::setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc, size_t foffset, std::string uri) {
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
// Stop compilation if it is an offline device - PAL runtime does not
|
||||
@@ -742,7 +773,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
|
||||
executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr);
|
||||
if (executable_ == nullptr) {
|
||||
buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
|
||||
LogPrintfError("Error: Executable for AMD HSA Code Object isn't created.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -753,33 +784,21 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
|
||||
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
|
||||
LogPrintfError("Error: AMD HSA Code Object loading failed.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
status = executable_->Freeze(nullptr);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
buildLog_ += "Error: Freezing the executable failed: ";
|
||||
LogPrintfError("Error: Freezing the executable failed.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Find the size of global variables from the binary
|
||||
if (!FindGlobalVarSize(binary, binSize)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto& kernelMeta : kernelMetadataMap_) {
|
||||
auto kernelName = kernelMeta.first;
|
||||
auto kernel =
|
||||
new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptionsFlattened(options));
|
||||
kernels()[kernelName] = kernel;
|
||||
|
||||
if (!kernel->init()) {
|
||||
for (auto& kit : kernels()) {
|
||||
LightningKernel* kernel = static_cast<LightningKernel*>(kit.second);
|
||||
if (!kernel->postLoad()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
|
||||
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
|
||||
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
|
||||
maxScratchRegs_ =
|
||||
|
||||
@@ -196,7 +196,7 @@ class HSAILProgram : public device::Program {
|
||||
}
|
||||
|
||||
//! Get symbol by name
|
||||
amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t* agent) const {
|
||||
amd::hsa::loader::Symbol* getSymbol(const char* symbol_name, const hsa_agent_t* agent) const {
|
||||
return executable_->GetSymbol(symbol_name, agent);
|
||||
}
|
||||
|
||||
@@ -208,8 +208,10 @@ class HSAILProgram : public device::Program {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
virtual const aclTargetInfo& info();
|
||||
#endif
|
||||
virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) override;
|
||||
|
||||
virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
virtual bool setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0,
|
||||
std::string uri = std::string()) override;
|
||||
|
||||
@@ -267,7 +269,10 @@ class LightningProgram : public HSAILProgram {
|
||||
virtual ~LightningProgram() {}
|
||||
|
||||
protected:
|
||||
virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) override;
|
||||
|
||||
virtual bool setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0,
|
||||
std::string uri = std::string()) override;
|
||||
|
||||
|
||||
@@ -45,11 +45,10 @@ Kernel::Kernel(std::string name, Program* prog)
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
bool LightningKernel::init() {
|
||||
if (!GetAttrCodePropMetadata()) {
|
||||
LogError("[ROC][Kernel] Could not get Code Prop Meta Data \n");
|
||||
return false;
|
||||
}
|
||||
return GetAttrCodePropMetadata();
|
||||
}
|
||||
|
||||
bool LightningKernel::postLoad() {
|
||||
// Set the kernel symbol name and size/alignment based on the kernel metadata
|
||||
// NOTE: kernel name is used to get the kernel code handle in V2,
|
||||
// but kernel symbol name is used in V3
|
||||
|
||||
@@ -79,6 +79,9 @@ class LightningKernel : public roc::Kernel {
|
||||
|
||||
//! Initializes the metadata required for this kernel
|
||||
virtual bool init() final;
|
||||
|
||||
//! Setup after code object loading
|
||||
bool postLoad();
|
||||
};
|
||||
|
||||
} // namespace roc
|
||||
|
||||
@@ -219,7 +219,7 @@ bool HSAILProgram::saveBinaryAndSetType(type_t type) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
bool HSAILProgram::setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc, size_t foffset, std::string uri) {
|
||||
return true;
|
||||
}
|
||||
@@ -263,7 +263,28 @@ bool LightningProgram::saveBinaryAndSetType(type_t type, void* rawBinary, size_t
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LightningProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) {
|
||||
// Find the size of global variables from the binary
|
||||
if (!FindGlobalVarSize(binary, binSize)) {
|
||||
buildLog_ += "Error: Cannot Find Global Var Sizes\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto &kernelMeta : kernelMetadataMap_) {
|
||||
const std::string kernelName = kernelMeta.first;
|
||||
Kernel* aKernel = new roc::LightningKernel(kernelName, this);
|
||||
if (!aKernel->init()) {
|
||||
return false;
|
||||
}
|
||||
aKernel->setUniformWorkGroupSize(useUniformWorkGroupSize);
|
||||
aKernel->setInternalKernelFlag(internalKernel);
|
||||
kernels()[kernelName] = aKernel;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LightningProgram::setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc, size_t foffset, std::string uri) {
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
// Stop compilation if it is an offline device - HSA runtime does not
|
||||
@@ -272,13 +293,6 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
return true;
|
||||
}
|
||||
|
||||
// Find the size of global variables from the binary
|
||||
if (!FindGlobalVarSize(binary, binSize)) {
|
||||
buildLog_ += "Error: Cannot Global Var Sizes ";
|
||||
buildLog_ += "\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
hsa_agent_t agent = rocDevice().getBackendDevice();
|
||||
hsa_status_t status;
|
||||
|
||||
@@ -320,16 +334,11 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto &kernelMeta : kernelMetadataMap_) {
|
||||
const std::string kernelName = kernelMeta.first;
|
||||
Kernel* aKernel = new roc::LightningKernel(kernelName, this);
|
||||
if (!aKernel->init()) {
|
||||
for (auto& kit : kernels()) {
|
||||
LightningKernel* kernel = static_cast<LightningKernel*>(kit.second);
|
||||
if (!kernel->postLoad()) {
|
||||
return false;
|
||||
}
|
||||
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") !=
|
||||
std::string::npos);
|
||||
kernels()[kernelName] = aKernel;
|
||||
}
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
return true;
|
||||
|
||||
@@ -94,7 +94,7 @@ class HSAILProgram : public roc::Program {
|
||||
protected:
|
||||
bool createBinary(amd::option::Options* options) override { return true; }
|
||||
|
||||
virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
virtual bool setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0,
|
||||
std::string uri = std::string()) override;
|
||||
|
||||
@@ -117,9 +117,12 @@ protected:
|
||||
private:
|
||||
bool saveBinaryAndSetType(type_t type, void* rawBinary, size_t size);
|
||||
|
||||
bool setKernels(amd::option::Options* options, void* binary, size_t binSize,
|
||||
bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) override final;
|
||||
|
||||
bool setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0,
|
||||
std::string uri = std::string()) final;
|
||||
std::string uri = std::string()) override final;
|
||||
};
|
||||
|
||||
/*@}*/} // namespace roc
|
||||
|
||||
@@ -622,6 +622,28 @@ int32_t Program::build(const std::vector<Device*>& devices, const char* options,
|
||||
}
|
||||
|
||||
//! Load the code object of every device program that has not been loaded yet.
//!
//! @param devices  Devices to load for; an empty list selects every device
//!                 that has a device program
//! @return false as soon as one device program fails to load, true otherwise
bool Program::load(const std::vector<Device*>& devices) {
  ScopedLock sl(buildLock_);

  for (const auto& it : devicePrograms_) {
    const Device& device = *(it.first);

    // If devices is specified, only load code object for those devices:
    // skip every device that is NOT in the requested list
    if (!devices.empty() &&
        std::find(devices.begin(), devices.end(), &device) == devices.end()) {
      continue;
    }

    device::Program& devProgram = *(it.second);

    // Only load the code object once
    if (devProgram.isCodeObjectLoaded()) {
      continue;
    }

    if (!devProgram.load()) {
      return false;
    }
  }

  return true;
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user