diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp index a45f0b76c4..2f5c717545 100644 --- a/rocclr/device/device.cpp +++ b/rocclr/device/device.cpp @@ -366,6 +366,10 @@ bool Device::BlitProgram::create(amd::Device* device, const char* extraKernels, DevLogPrintfError("Build failed for Kernel: %s \n", kernels.c_str()); return false; } + if (!program_->load()) { + DevLogPrintfError("Could not load the kernels: %s \n", kernels.c_str()); + return false; + } return true; } diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index a46ee414f9..badb62d1e8 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -91,10 +91,6 @@ class Device; struct KernelParameterDescriptor; struct Coord3D; -namespace option { -class Options; -} // namespace option - //! @note: the defines match hip values enum MemoryAdvice : uint32_t { SetReadMostly = 1, ///< Data will mostly be read and only occassionally be written to diff --git a/rocclr/device/devprogram.cpp b/rocclr/device/devprogram.cpp index 79cb22eb37..50100f73d7 100644 --- a/rocclr/device/devprogram.cpp +++ b/rocclr/device/devprogram.cpp @@ -1150,13 +1150,16 @@ bool Program::linkImplLC(amd::option::Options* options) { case FILE_TYPE_ISA: { amd::Comgr::destroy_data_set(inputs); binary_t isaBinary = binary(); - finfo_t isaFdesc = BinaryFd(); if (GPU_DUMP_CODE_OBJECT) { dumpCodeObject(std::string{(const char*)isaBinary.first, isaBinary.second}); } - return setKernels(options, const_cast(isaBinary.first), isaBinary.second, - isaFdesc.first, isaFdesc.second, BinaryURI()); - break; + + if (!createKernels(const_cast(isaBinary.first), isaBinary.second, + options->oVariables->UniformWorkGroupSize, internal_)) { + buildLog_ += "Error: Cannot create kernels.\n"; + return false; + } + return true; } default: buildLog_ += "Error while Codegen phase: the binary is incomplete \n"; @@ -1284,8 +1287,9 @@ bool Program::linkImplLC(amd::option::Options* options) { // Destroy original memory with executable 
after compilation delete[] executable; - if (!setKernels(options, const_cast(clBinary()->data().first), - clBinary()->data().second)) { + if (!createKernels(const_cast(clBinary()->data().first), clBinary()->data().second, + options->oVariables->UniformWorkGroupSize, internal_)) { + buildLog_ += "Error: Cannot create kernels.\n"; return false; } @@ -1352,17 +1356,17 @@ bool Program::linkImplHSAIL(amd::option::Options* options) { fin_options.append(" -sc-xnack-iommu"); } - if (device().settings().enableWave32Mode_) { - fin_options.append(" -force-wave-size-32"); - } + if (device().settings().enableWave32Mode_) { + fin_options.append(" -force-wave-size-32"); + } - if (device().settings().enableWgpMode_) { - fin_options.append(" -force-wgp-mode"); - } + if (device().settings().enableWgpMode_) { + fin_options.append(" -force-wgp-mode"); + } - if (device().settings().hsailExplicitXnack_) { - fin_options.append(" -xnack"); - } + if (device().settings().hsailExplicitXnack_) { + fin_options.append(" -xnack"); + } errorCode = amd::Hsail::Compile(device().compiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, logFunction); @@ -1382,8 +1386,8 @@ bool Program::linkImplHSAIL(amd::option::Options* options) { } // Call the device layer to setup all available kernels on the actual device - if (!setKernels(options, binary, binSize)) { - buildLog_ += "Error: Cannot set kernel \n"; + if (!createKernels(binary, binSize, options->oVariables->UniformWorkGroupSize, internal_)) { + buildLog_ += "Error: Cannot create kernel.\n"; return false; } @@ -1772,6 +1776,48 @@ int32_t Program::build(const std::string& sourceCode, const char* origOptions, return buildError(); } +// ================================================================================================ +bool Program::loadHSAIL() { +#if defined(WITH_COMPILER_LIB) + acl_error errorCode; + size_t binSize; + void* bin = const_cast(amd::Hsail::ExtractSection(device().compiler(), binaryElf_, + &binSize, aclTEXT, 
&errorCode)); + if (errorCode != ACL_SUCCESS) { + LogPrintfError("Error: cannot extract ISA from compiled binary.\n"); + return false; + } + // Call the device layer to setup all available kernels on the actual device + return setKernels(bin, binSize); +#else + return false; +#endif +} + +// ================================================================================================ +bool Program::loadLC() { +#if defined(USE_COMGR_LIBRARY) + return setKernels(const_cast(binary().first), binary().second, + BinaryFd().first, BinaryFd().second, BinaryURI()); +#else + return false; +#endif +} + +// ================================================================================================ +bool Program::load() { + bool ret; + if (isLC()) { + ret = loadLC(); + } else { + ret = loadHSAIL(); + } + if (ret) { + coLoaded_ = 1; + } + return ret; +} + // ================================================================================================ std::vector Program::ProcessOptions(amd::option::Options* options) { std::vector optionsVec; diff --git a/rocclr/device/devprogram.hpp b/rocclr/device/devprogram.hpp index 69ca9aa7cf..be61766335 100644 --- a/rocclr/device/devprogram.hpp +++ b/rocclr/device/devprogram.hpp @@ -123,7 +123,8 @@ class Program : public amd::HeapObject { uint32_t internal_ : 1; //!< Internal blit program uint32_t isLC_ : 1; //!< LC was used for the program compilation uint32_t hasGlobalStores_ : 1; //!< Program has writable program scope variables - uint32_t isHIP_ : 1; //!< Determine if the program is for HIP + uint32_t isHIP_ : 1; //!< Determine if the program is for HIP + uint32_t coLoaded_ : 1; //!< Has the code object been loaded }; uint32_t flags_; //!< Program flags }; @@ -178,15 +179,18 @@ class Program : public amd::HeapObject { const char** headerIncludeNames, const char* origOptions, amd::option::Options* options); - //! Builds the device program. + //! Link the device program. 
int32_t link(const std::vector& inputPrograms, const char* origOptions, amd::option::Options* options); - //! Builds the device program. + //! Build the device program. int32_t build(const std::string& sourceCode, const char* origOptions, amd::option::Options* options, const std::vector& preCompiledHeaders); - //! Returns the device object, associated with this program. + //! Load the device program. + bool load(); + + //! Return the device object, associated with this program. const amd::Device& device() const { return device_(); } //! Return the compiler options used to build the program. @@ -248,6 +252,9 @@ class Program : public amd::HeapObject { //! Global variables are a part of the code segment bool hasGlobalStores() const { return hasGlobalStores_; } + //! Return TRUE if the program has been loaded + bool isCodeObjectLoaded() const { return coLoaded_; } + #if defined(USE_COMGR_LIBRARY) amd_comgr_metadata_node_t metadata() const { return metadata_; } @@ -324,9 +331,11 @@ class Program : public amd::HeapObject { //! return target info virtual const aclTargetInfo& info() = 0; #endif + virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, + bool internalKernel) { return true; } virtual bool setKernels( - amd::option::Options* options, void* binary, size_t binSize, + void* binary, size_t binSize, amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0, std::string uri = std::string()) { return true; } @@ -397,6 +406,12 @@ class Program : public amd::HeapObject { //! Link the device program with HSAIL path bool linkImplHSAIL(amd::option::Options* options); + //! Load the device program with LC path + bool loadLC(); + + //! Load the device program with HSAIL path + bool loadHSAIL(); + #if defined(USE_COMGR_LIBRARY) //! 
Dump the log data object to the build log, if a log data object is present void extractBuildLog(amd_comgr_data_set_t dataSet); diff --git a/rocclr/device/pal/palkernel.cpp b/rocclr/device/pal/palkernel.cpp index 953de42916..2ca8c367e4 100644 --- a/rocclr/device/pal/palkernel.cpp +++ b/rocclr/device/pal/palkernel.cpp @@ -85,15 +85,26 @@ bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t return true; } -bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { +bool HSAILKernel::aqlCreateHWInfo() { + hsa_agent_t agent = {amd::Device::toHandle(&(device()))}; + std::string openclKernelName = device::Kernel::openclMangledName(name()); + amd::hsa::loader::Symbol* sym = prog().getSymbol(openclKernelName.c_str(), &agent); + if (!sym) { + LogPrintfError("Error: Getting kernel ISA code symbol %s from AMD HSA Code Object failed.\n", + openclKernelName.c_str()); + return false; + } + amd_kernel_code_t* akc = &akc_; if (!setKernelCode(sym, akc)) { + LogPrintfError("Error: setKernelCode() failed.\n"); return false; } if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast(&codeSize_))) { + LogPrintfError("Error: sym->GetInfo() failed.\n"); return false; } @@ -107,45 +118,30 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { return true; } -HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions) +HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, bool internalKernel) : device::Kernel(prog->device(), name, *prog), - compileOptions_(compileOptions), index_(0), code_(0), codeSize_(0) { flags_.hsa_ = true; + flags_.internalKernel_ = internalKernel; } HSAILKernel::~HSAILKernel() {} -bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { +bool HSAILKernel::postLoad() { +#if defined(WITH_COMPILER_LIB) + if (!aqlCreateHWInfo()) { + return false; + } +#endif + return true; +} + +bool HSAILKernel::init() { #if defined(WITH_COMPILER_LIB) 
acl_error error = ACL_SUCCESS; std::string openClKernelName = openclMangledName(name()); - flags_.internalKernel_ = - (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; - // compile kernel down to ISA - if (finalize) { - std::string options(compileOptions_.c_str()); - options.append(" -just-kernel="); - options.append(openClKernelName.c_str()); - // Append an option so that we can selectively enable a SCOption on CZ - // whenever IOMMUv2 is enabled. - if (palNullDevice().settings().svmFineGrainSystem_) { - options.append(" -sc-xnack-iommu"); - } - error = amd::Hsail::Compile(palNullDevice().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG, - ACL_TYPE_ISA, nullptr); - buildLog_ += amd::Hsail::GetCompilerLog(palNullDevice().compiler()); - if (error != ACL_SUCCESS) { - LogError("Failed to finalize kernel"); - return false; - } - } - - if (!aqlCreateHWInfo(sym)) { - return false; - } // Pull out metadata from the ELF size_t sizeOfArgList; @@ -437,13 +433,10 @@ const LightningProgram& LightningKernel::prog() const { #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { - flags_.internalKernel_ = - (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? 
true : false; - - if (!GetAttrCodePropMetadata()) { - return false; - } + return GetAttrCodePropMetadata(); +} +bool LightningKernel::postLoad() { if (codeObjectVer() == 2) { symbolName_ = name(); } @@ -451,7 +444,7 @@ bool LightningKernel::init() { // Copy codeobject of this kernel from the program CPU segment hsa_agent_t agent = {amd::Device::toHandle(&(device()))}; - auto sym = prog().GetSymbol(symbolName().c_str(), &agent); + auto sym = prog().getSymbol(symbolName().c_str(), &agent); if (!setKernelCode(sym, &akc_)) { return false; @@ -465,7 +458,7 @@ bool LightningKernel::init() { amd::hsa::loader::Symbol* rth_symbol; // Get the runtime handle symbol GPU address - rth_symbol = prog().GetSymbol(RuntimeHandle().c_str(), &agent); + rth_symbol = prog().getSymbol(RuntimeHandle().c_str(), &agent); uint64_t symbol_address; rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); diff --git a/rocclr/device/pal/palkernel.hpp b/rocclr/device/pal/palkernel.hpp index 0a432062bd..9fc585fd57 100644 --- a/rocclr/device/pal/palkernel.hpp +++ b/rocclr/device/pal/palkernel.hpp @@ -60,13 +60,15 @@ class LightningProgram; */ class HSAILKernel : public device::Kernel { public: - HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions); + HSAILKernel(std::string name, HSAILProgram* prog, bool internalKernel); virtual ~HSAILKernel(); //! Initializes the metadata required for this kernel, - //! finalizes the kernel if needed - bool init(amd::hsa::loader::Symbol* sym, bool finalize = false); + bool init(); + + //! Setup after code object loading + bool postLoad(); //! Returns PAL, possibly null, device object, associated with this kernel. const NullDevice& palNullDevice() const { return reinterpret_cast(dev_); } @@ -122,7 +124,7 @@ class HSAILKernel : public device::Kernel { protected: //! Creates AQL kernel HW info - bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym); + bool aqlCreateHWInfo(); //! 
Get the kernel code and copy the code object from the program CPU segment bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc); @@ -131,7 +133,6 @@ class HSAILKernel : public device::Kernel { void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize, const uint16_t numSGPRs, const uint16_t numVGPRs); - std::string compileOptions_; //!< compile used for finalizing this kernel amd_kernel_code_t akc_; //!< AQL kernel code on CPU uint index_; //!< Kernel index in the program @@ -141,18 +142,18 @@ class HSAILKernel : public device::Kernel { class LightningKernel : public HSAILKernel { public: - LightningKernel(const std::string& name, HSAILProgram* prog, const std::string& compileOptions) - : HSAILKernel(name, prog, compileOptions) {} + LightningKernel(const std::string& name, HSAILProgram* prog, bool internalKernel) + : HSAILKernel(name, prog, internalKernel) {} //! Returns Lightning program associated with this kernel const LightningProgram& prog() const; - //! Initializes the metadata required for this kernel, - bool init(amd::hsa::loader::Symbol* symbol); - #if defined(USE_COMGR_LIBRARY) - //! Initializes the metadata required for this kernel, + //! Initializes the metadata required for this kernel bool init(); + + //! 
Setup after code object loading + bool postLoad(); #endif }; diff --git a/rocclr/device/pal/palprogram.cpp b/rocclr/device/pal/palprogram.cpp index 76bffa9b3a..81329e85c8 100644 --- a/rocclr/device/pal/palprogram.cpp +++ b/rocclr/device/pal/palprogram.cpp @@ -243,7 +243,44 @@ inline static std::vector splitSpaceSeparatedString(char* str) { return vec; } -bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize, +bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, + bool internalKernel) { + size_t kernelNamesSize = 0; + acl_error errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, + RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; + return false; + } + if (kernelNamesSize > 0) { + char* kernelNames = new char[kernelNamesSize]; + errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, + nullptr, kernelNames, &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; + delete[] kernelNames; + return false; + } + std::vector vKernels = splitSpaceSeparatedString(kernelNames); + delete[] kernelNames; + for (const auto& it : vKernels) { + std::string kernelName(it); + + HSAILKernel* aKernel = new HSAILKernel(kernelName, this, internalKernel); + kernels()[kernelName] = aKernel; + + if (!aKernel->init()) { + buildLog_ += "Error: Kernel initialization failed.\n"; + return false; + } + + aKernel->setUniformWorkGroupSize(useUniformWorkGroupSize); + } + } + return true; +} + +bool HSAILProgram::setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc, size_t foffset, std::string uri) { #if defined(WITH_COMPILER_LIB) // Stop compilation if it is an offline device - PAL runtime does not @@ -275,56 +312,23 @@ bool 
HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_ return false; } - size_t kernelNamesSize = 0; - acl_error errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, - nullptr, &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; - return false; + bool dynamicParallelism = false; + for (auto& kit : kernels()) { + HSAILKernel* aKernel = static_cast(kit.second); + if (!aKernel->postLoad()) { + return false; + } + dynamicParallelism |= aKernel->dynamicParallelism(); + // Find max scratch regs used in the program. It's used for scratch buffer preallocation + // with dynamic parallelism, since runtime doesn't know which child kernel will be called + maxScratchRegs_ = + std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); + maxVgprs_ = std::max(static_cast(aKernel->workGroupInfo()->usedVGPRs_), maxVgprs_); } - if (kernelNamesSize > 0) { - char* kernelNames = new char[kernelNamesSize]; - errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, - &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; - delete[] kernelNames; - return false; - } - std::vector vKernels = splitSpaceSeparatedString(kernelNames); - delete[] kernelNames; - bool dynamicParallelism = false; - for (const auto& it : vKernels) { - std::string kernelName(it); - std::string openclKernelName = device::Kernel::openclMangledName(kernelName); - HSAILKernel* aKernel = - new HSAILKernel(kernelName, this, options->origOptionStr + ProcessOptionsFlattened(options)); - kernels()[kernelName] = aKernel; - - amd::hsa::loader::Symbol* sym = executable_->GetSymbol(openclKernelName.c_str(), &agent); - if (!sym) { - buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName + - "' from AMD HSA Code Object 
failed. Kernel initialization failed.\n"; - return false; - } - if (!aKernel->init(sym, false)) { - buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n"; - return false; - } - buildLog_ += aKernel->buildLog(); - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - dynamicParallelism |= aKernel->dynamicParallelism(); - // Find max scratch regs used in the program. It's used for scratch buffer preallocation - // with dynamic parallelism, since runtime doesn't know which child kernel will be called - maxScratchRegs_ = - std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); - maxVgprs_ = std::max(static_cast(aKernel->workGroupInfo()->usedVGPRs_), maxVgprs_); - } - // Allocate kernel table for device enqueuing - if (!isNull() && dynamicParallelism && !allocKernelTable()) { - return false; - } + // Allocate kernel table for device enqueuing + if (!isNull() && dynamicParallelism && !allocKernelTable()) { + return false; } DestroySegmentCpuAccess(); @@ -731,7 +735,34 @@ bool LightningProgram::createBinary(amd::option::Options* options) { return true; } -bool LightningProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize, +bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, + bool internalKernel) { +#if defined(USE_COMGR_LIBRARY) + // Find the size of global variables from the binary + if (!FindGlobalVarSize(binary, binSize)) { + buildLog_ += "Error: Cannot Find Global Var Sizes\n"; + return false; + } + + for (const auto& kernelMeta : kernelMetadataMap_) { + auto kernelName = kernelMeta.first; + auto kernel = new LightningKernel(kernelName, this, internalKernel); + if (kernel == nullptr) { + return false; + } + if (!kernel->init()) { + buildLog_ += "[ROC][Kernel] Could not get Code Prop Meta Data \n"; + return false; + } + kernels()[kernelName] = kernel; + + kernel->setUniformWorkGroupSize(useUniformWorkGroupSize); + } 
+#endif + return true; +} + +bool LightningProgram::setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc, size_t foffset, std::string uri) { #if defined(USE_COMGR_LIBRARY) // Stop compilation if it is an offline device - PAL runtime does not @@ -742,7 +773,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr); if (executable_ == nullptr) { - buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; + LogPrintfError("Error: Executable for AMD HSA Code Object isn't created.\n"); return false; } @@ -753,33 +784,21 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; + LogPrintfError("Error: AMD HSA Code Object loading failed.\n"); return false; } status = executable_->Freeze(nullptr); if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Freezing the executable failed: "; + LogPrintfError("Error: Freezing the executable failed.\n"); return false; } - // Find the size of global variables from the binary - if (!FindGlobalVarSize(binary, binSize)) { - return false; - } - - for (const auto& kernelMeta : kernelMetadataMap_) { - auto kernelName = kernelMeta.first; - auto kernel = - new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptionsFlattened(options)); - kernels()[kernelName] = kernel; - - if (!kernel->init()) { + for (auto& kit : kernels()) { + LightningKernel* kernel = static_cast(kit.second); + if (!kernel->postLoad()) { return false; } - - kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - // Find max scratch regs used in the program. 
It's used for scratch buffer preallocation // with dynamic parallelism, since runtime doesn't know which child kernel will be called maxScratchRegs_ = diff --git a/rocclr/device/pal/palprogram.hpp b/rocclr/device/pal/palprogram.hpp index 74eb902f60..dd2a429902 100644 --- a/rocclr/device/pal/palprogram.hpp +++ b/rocclr/device/pal/palprogram.hpp @@ -196,7 +196,7 @@ class HSAILProgram : public device::Program { } //! Get symbol by name - amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t* agent) const { + amd::hsa::loader::Symbol* getSymbol(const char* symbol_name, const hsa_agent_t* agent) const { return executable_->GetSymbol(symbol_name, agent); } @@ -208,8 +208,10 @@ class HSAILProgram : public device::Program { #if defined(WITH_COMPILER_LIB) virtual const aclTargetInfo& info(); #endif + virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, + bool internalKernel) override; - virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize, + virtual bool setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0, std::string uri = std::string()) override; @@ -267,7 +269,10 @@ class LightningProgram : public HSAILProgram { virtual ~LightningProgram() {} protected: - virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize, + virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, + bool internalKernel) override; + + virtual bool setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0, std::string uri = std::string()) override; diff --git a/rocclr/device/rocm/rockernel.cpp b/rocclr/device/rocm/rockernel.cpp index 58ae759f4e..e9caf329d9 100644 --- a/rocclr/device/rocm/rockernel.cpp +++ b/rocclr/device/rocm/rockernel.cpp @@ -45,11 +45,10 @@ Kernel::Kernel(std::string name, Program* prog) #if defined(USE_COMGR_LIBRARY) bool 
LightningKernel::init() { - if (!GetAttrCodePropMetadata()) { - LogError("[ROC][Kernel] Could not get Code Prop Meta Data \n"); - return false; - } + return GetAttrCodePropMetadata(); +} +bool LightningKernel::postLoad() { // Set the kernel symbol name and size/alignment based on the kernel metadata // NOTE: kernel name is used to get the kernel code handle in V2, // but kernel symbol name is used in V3 diff --git a/rocclr/device/rocm/rockernel.hpp b/rocclr/device/rocm/rockernel.hpp index e4719c82ee..c6cedd83dc 100644 --- a/rocclr/device/rocm/rockernel.hpp +++ b/rocclr/device/rocm/rockernel.hpp @@ -79,6 +79,9 @@ class LightningKernel : public roc::Kernel { //! Initializes the metadata required for this kernel virtual bool init() final; + + //! Setup after code object loading + bool postLoad(); }; } // namespace roc diff --git a/rocclr/device/rocm/rocprogram.cpp b/rocclr/device/rocm/rocprogram.cpp index 755e094250..f791e449b8 100644 --- a/rocclr/device/rocm/rocprogram.cpp +++ b/rocclr/device/rocm/rocprogram.cpp @@ -219,7 +219,7 @@ bool HSAILProgram::saveBinaryAndSetType(type_t type) { return true; } -bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize, +bool HSAILProgram::setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc, size_t foffset, std::string uri) { return true; } @@ -263,7 +263,28 @@ bool LightningProgram::saveBinaryAndSetType(type_t type, void* rawBinary, size_t return true; } -bool LightningProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize, +bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, + bool internalKernel) { + // Find the size of global variables from the binary + if (!FindGlobalVarSize(binary, binSize)) { + buildLog_ += "Error: Cannot Find Global Var Sizes\n"; + return false; + } + + for (const auto &kernelMeta : kernelMetadataMap_) { + const std::string kernelName = kernelMeta.first; + Kernel* aKernel = new 
roc::LightningKernel(kernelName, this); + if (!aKernel->init()) { + return false; + } + aKernel->setUniformWorkGroupSize(useUniformWorkGroupSize); + aKernel->setInternalKernelFlag(internalKernel); + kernels()[kernelName] = aKernel; + } + return true; +} + +bool LightningProgram::setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc, size_t foffset, std::string uri) { #if defined(USE_COMGR_LIBRARY) // Stop compilation if it is an offline device - HSA runtime does not @@ -272,13 +293,6 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s return true; } - // Find the size of global variables from the binary - if (!FindGlobalVarSize(binary, binSize)) { - buildLog_ += "Error: Cannot Global Var Sizes "; - buildLog_ += "\n"; - return false; - } - hsa_agent_t agent = rocDevice().getBackendDevice(); hsa_status_t status; @@ -320,16 +334,11 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s return false; } - for (const auto &kernelMeta : kernelMetadataMap_) { - const std::string kernelName = kernelMeta.first; - Kernel* aKernel = new roc::LightningKernel(kernelName, this); - if (!aKernel->init()) { + for (auto& kit : kernels()) { + LightningKernel* kernel = static_cast(kit.second); + if (!kernel->postLoad()) { return false; } - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != - std::string::npos); - kernels()[kernelName] = aKernel; } #endif // defined(USE_COMGR_LIBRARY) return true; diff --git a/rocclr/device/rocm/rocprogram.hpp b/rocclr/device/rocm/rocprogram.hpp index 6ed31dab0f..0776df6898 100644 --- a/rocclr/device/rocm/rocprogram.hpp +++ b/rocclr/device/rocm/rocprogram.hpp @@ -94,7 +94,7 @@ class HSAILProgram : public roc::Program { protected: bool createBinary(amd::option::Options* options) override { return true; } - virtual bool setKernels(amd::option::Options* options, void* binary, 
size_t binSize, + virtual bool setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0, std::string uri = std::string()) override; @@ -117,9 +117,12 @@ protected: private: bool saveBinaryAndSetType(type_t type, void* rawBinary, size_t size); - bool setKernels(amd::option::Options* options, void* binary, size_t binSize, + bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, + bool internalKernel) override final; + + bool setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc = amd::Os::FDescInit(), size_t foffset = 0, - std::string uri = std::string()) final; + std::string uri = std::string()) override final; }; /*@}*/} // namespace roc diff --git a/rocclr/platform/program.cpp b/rocclr/platform/program.cpp index fc2b518402..d31a8fe182 100644 --- a/rocclr/platform/program.cpp +++ b/rocclr/platform/program.cpp @@ -622,6 +622,28 @@ int32_t Program::build(const std::vector& devices, const char* options, } bool Program::load(const std::vector& devices) { + ScopedLock sl(buildLock_); + + for (const auto& it : devicePrograms_) { + const Device& device = *(it.first); + + // When a device list is given, skip devices that are not in it; an empty list loads for all devices + if (!devices.empty() && std::find(devices.begin(), devices.end(), &device) == devices.end()) { + continue; + } + + device::Program& devProgram = *(it.second); + + // Only load the code object once + if (devProgram.isCodeObjectLoaded()) { + continue; + } + + if (!devProgram.load()) { + return false; + } + } + return true; }