ea28025939
Report proper target id for xnack in HSAIL path. Runtime
will use ISA table and report hsailName().
Fix offline compilation path for PAL.
Change-Id: Ic0250bf6b9c193d867aec9800a319da1bf00c3ee
[ROCm/clr commit: a543d4a860]
807 строки
26 KiB
C++
807 строки
26 KiB
C++
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE. */
|
|
|
|
#include "os/os.hpp"
|
|
#include "utils/flags.hpp"
|
|
#include "aclTypes.h"
|
|
#include "device/pal/palprogram.hpp"
|
|
#include "device/pal/palblit.hpp"
|
|
#include "utils/options.hpp"
|
|
#include "hsa.h"
|
|
#include "hsa_ext_image.h"
|
|
#include "amd_hsa_loader.hpp"
|
|
|
|
#include <algorithm>
|
|
#include <cstdio>
|
|
#include <fstream>
|
|
#include <memory>
|
|
#include <iterator>
|
|
#include <sstream>
|
|
|
|
namespace pal {
|
|
|
|
Segment::Segment() : gpuAccess_(nullptr), cpuAccess_(nullptr), cpuMem_(nullptr) {}
|
|
|
|
Segment::~Segment() {
|
|
if (gpuAccess_ != nullptr) {
|
|
gpuAccess_->owner()->release();
|
|
}
|
|
DestroyCpuAccess();
|
|
}
|
|
|
|
void Segment::DestroyCpuAccess() {
|
|
if (cpuAccess_ != nullptr) {
|
|
cpuAccess_->unmap(nullptr);
|
|
delete cpuAccess_;
|
|
cpuAccess_ = nullptr;
|
|
}
|
|
if (cpuMem_ != nullptr) {
|
|
delete[] cpuMem_;
|
|
cpuMem_ = nullptr;
|
|
}
|
|
}
|
|
|
|
bool Segment::gpuAddressOffset(uint64_t offAddr, size_t* offset) {
|
|
uint64_t baseAddr = gpuAccess_->vmAddress();
|
|
if (baseAddr > offAddr) {
|
|
return false;
|
|
}
|
|
*offset = (offAddr - baseAddr);
|
|
return true;
|
|
}
|
|
|
|
bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t size, size_t align,
|
|
bool zero) {
|
|
if (prog.isNull()) {
|
|
LogError("[OCL] cannot create a mem object on an offline device!");
|
|
return false;
|
|
}
|
|
|
|
align = amd::alignUp(align, sizeof(uint32_t));
|
|
|
|
amd::Memory* amd_mem_obj = new (prog.palDevice().context())
|
|
amd::Buffer(prog.palDevice().context(), 0, amd::alignUp(size, align),
|
|
// HIP requires SVM allocation for segment code due to possible global variable
|
|
// access and global variables are a part of code segment with the latest loader
|
|
amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);
|
|
|
|
if (amd_mem_obj == nullptr) {
|
|
LogError("[OCL] failed to create a mem object!");
|
|
return false;
|
|
}
|
|
|
|
if (!amd_mem_obj->create(nullptr)) {
|
|
LogError("[OCL] failed to create a svm hidden buffer!");
|
|
amd_mem_obj->release();
|
|
return false;
|
|
}
|
|
|
|
gpuAccess_ = static_cast<pal::Memory*>(amd_mem_obj->getDeviceMemory(prog.palDevice(), false));
|
|
|
|
if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
|
|
void* ptr = nullptr;
|
|
cpuAccess_ = new pal::Memory(prog.palDevice(), amd::alignUp(size, align));
|
|
if ((cpuAccess_ == nullptr) || !cpuAccess_->create(pal::Resource::Remote)) {
|
|
delete cpuAccess_;
|
|
cpuAccess_ = nullptr;
|
|
ptr = cpuMem_ = reinterpret_cast<address>(new char[amd::alignUp(size, align)]);
|
|
if (cpuMem_ == nullptr) {
|
|
return false;
|
|
}
|
|
} else {
|
|
ptr = cpuAccess_->map(nullptr, 0);
|
|
}
|
|
if (zero) {
|
|
memset(ptr, 0, size);
|
|
}
|
|
}
|
|
|
|
// Don't clear GPU memory if CPU backing store is available
|
|
if ((cpuAccess_ == nullptr) && zero && !prog.isInternal()) {
|
|
uint64_t pattern = 0;
|
|
size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
|
|
prog.palDevice().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
|
|
amd::Coord3D(size));
|
|
}
|
|
|
|
switch (segment) {
|
|
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
|
|
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
|
|
case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
|
|
prog.addGlobalStore(gpuAccess_);
|
|
prog.setGlobalVariableTotalSize(prog.globalVariableTotalSize() + size);
|
|
break;
|
|
case AMDGPU_HSA_SEGMENT_CODE_AGENT:
|
|
prog.setCodeObjects(this, gpuAccess_, reinterpret_cast<address>(cpuAddress(0)));
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void Segment::copy(size_t offset, const void* src, size_t size) {
|
|
if (cpuAccess_ != nullptr) {
|
|
amd::Os::fastMemcpy(cpuAddress(offset), src, size);
|
|
} else {
|
|
if (cpuMem_ != nullptr) {
|
|
amd::Os::fastMemcpy(cpuAddress(offset), src, size);
|
|
}
|
|
amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
|
|
VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
|
|
Memory& xferBuf = gpu.xferWrite().Acquire(size);
|
|
size_t tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
|
|
size_t srcOffs = 0;
|
|
while (size != 0) {
|
|
xferBuf.hostWrite(&gpu, reinterpret_cast<const_address>(src) + srcOffs, 0, tmpSize);
|
|
xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true);
|
|
size -= tmpSize;
|
|
srcOffs += tmpSize;
|
|
tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
|
|
}
|
|
gpu.xferWrite().Release(xferBuf);
|
|
gpu.waitAllEngines();
|
|
}
|
|
}
|
|
|
|
bool Segment::freeze(bool destroySysmem) {
|
|
VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
|
|
bool result = true;
|
|
if (cpuAccess_ != nullptr) {
|
|
assert(gpuAccess_->size() == cpuAccess_->size() && "Backing store size mismatch!");
|
|
amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
|
|
result = cpuAccess_->partialMemCopyTo(gpu, 0, 0, gpuAccess_->size(), *gpuAccess_, false, true);
|
|
gpu.waitAllEngines();
|
|
}
|
|
assert(!destroySysmem || (cpuAccess_ == nullptr));
|
|
return result;
|
|
}
|
|
|
|
HSAILProgram::HSAILProgram(Device& device, amd::Program& owner)
|
|
: Program(device, owner),
|
|
rawBinary_(nullptr),
|
|
kernels_(nullptr),
|
|
codeSegGpu_(nullptr),
|
|
codeSegment_(nullptr),
|
|
maxScratchRegs_(0),
|
|
maxVgprs_(0),
|
|
executable_(nullptr),
|
|
loaderContext_(this) {
|
|
assert(device.isOnline());
|
|
loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
|
|
}
|
|
|
|
HSAILProgram::HSAILProgram(NullDevice& device, amd::Program& owner)
|
|
: Program(device, owner),
|
|
rawBinary_(nullptr),
|
|
kernels_(nullptr),
|
|
codeSegGpu_(nullptr),
|
|
codeSegment_(nullptr),
|
|
maxScratchRegs_(0),
|
|
maxVgprs_(0),
|
|
executable_(nullptr),
|
|
loaderContext_(this) {
|
|
assert(!device.isOnline());
|
|
isNull_ = true;
|
|
loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
|
|
}
|
|
|
|
HSAILProgram::~HSAILProgram() {
|
|
// Destroy internal static samplers
|
|
for (auto& it : staticSamplers_) {
|
|
delete it;
|
|
}
|
|
#if defined(WITH_COMPILER_LIB)
|
|
if (rawBinary_ != nullptr) {
|
|
amd::Hsail::FreeMem(binaryElf_, rawBinary_);
|
|
}
|
|
acl_error error;
|
|
// Free the elf binary
|
|
if (binaryElf_ != nullptr) {
|
|
error = amd::Hsail::BinaryFini(binaryElf_);
|
|
if (error != ACL_SUCCESS) {
|
|
LogWarning("Error while destroying the acl binary \n");
|
|
}
|
|
}
|
|
#endif // defined(WITH_COMPILER_LIB)
|
|
releaseClBinary();
|
|
if (executable_) {
|
|
loader_->DestroyExecutable(executable_);
|
|
}
|
|
|
|
delete kernels_;
|
|
|
|
if (loader_) {
|
|
amd::hsa::loader::Loader::Destroy(loader_);
|
|
}
|
|
}
|
|
|
|
|
|
inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
|
|
std::string s(str);
|
|
std::stringstream ss(s);
|
|
std::istream_iterator<std::string> beg(ss), end;
|
|
std::vector<std::string> vec(beg, end);
|
|
return vec;
|
|
}
|
|
|
|
bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
|
bool internalKernel) {
|
|
#if defined(WITH_COMPILER_LIB)
|
|
// ACL_TYPE_CG stage is not performed for offline compilation
|
|
executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr);
|
|
if (executable_ == nullptr) {
|
|
buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
|
|
return false;
|
|
}
|
|
size_t size = binSize;
|
|
hsa_code_object_t code_object;
|
|
code_object.handle = reinterpret_cast<uint64_t>(binary);
|
|
|
|
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
|
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
|
|
return false;
|
|
}
|
|
status = executable_->Freeze(nullptr);
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
buildLog_ += "Error: AMD HSA Code Object freeze failed.\n";
|
|
return false;
|
|
}
|
|
|
|
size_t kernelNamesSize = 0;
|
|
acl_error errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_,
|
|
RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
|
|
if (errorCode != ACL_SUCCESS) {
|
|
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
|
|
return false;
|
|
}
|
|
if (kernelNamesSize > 0) {
|
|
std::vector<char> kernelNames(kernelNamesSize);
|
|
errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES,
|
|
nullptr, kernelNames.data(), &kernelNamesSize);
|
|
if (errorCode != ACL_SUCCESS) {
|
|
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
|
|
return false;
|
|
}
|
|
std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames.data());
|
|
for (const auto& it : vKernels) {
|
|
std::string kernelName(it);
|
|
|
|
HSAILKernel* aKernel = new HSAILKernel(kernelName, this, internalKernel);
|
|
kernels()[kernelName] = aKernel;
|
|
|
|
if (!aKernel->init()) {
|
|
buildLog_ += "Error: Kernel initialization failed.\n";
|
|
return false;
|
|
}
|
|
|
|
aKernel->setUniformWorkGroupSize(useUniformWorkGroupSize);
|
|
}
|
|
}
|
|
|
|
DestroySegmentCpuAccess();
|
|
#endif // defined(WITH_COMPILER_LIB)
|
|
return true;
|
|
}
|
|
|
|
bool HSAILProgram::setKernels(void* binary, size_t binSize,
|
|
amd::Os::FileDesc fdesc, size_t foffset, std::string uri) {
|
|
#if defined(WITH_COMPILER_LIB)
|
|
if (!device().isOnline()) {
|
|
return true;
|
|
}
|
|
|
|
bool dynamicParallelism = false;
|
|
for (auto& kit : kernels()) {
|
|
HSAILKernel* aKernel = static_cast<HSAILKernel*>(kit.second);
|
|
if (!aKernel->postLoad()) {
|
|
return false;
|
|
}
|
|
dynamicParallelism |= aKernel->dynamicParallelism();
|
|
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
|
|
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
|
|
maxScratchRegs_ =
|
|
std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
|
maxVgprs_ = std::max(static_cast<uint>(aKernel->workGroupInfo()->usedVGPRs_), maxVgprs_);
|
|
}
|
|
|
|
// Allocate kernel table for device enqueuing
|
|
if (!isNull() && dynamicParallelism && !allocKernelTable()) {
|
|
return false;
|
|
}
|
|
#endif // defined(WITH_COMPILER_LIB)
|
|
return true;
|
|
}
|
|
|
|
bool HSAILProgram::createBinary(amd::option::Options* options) { return true; }
|
|
|
|
bool HSAILProgram::allocKernelTable() {
|
|
if (isNull()) {
|
|
// Cannot create a kernel table for offline devices.
|
|
return false;
|
|
}
|
|
|
|
uint size = kernels().size() * sizeof(size_t);
|
|
|
|
kernels_ = new pal::Memory(palDevice(), size);
|
|
// Initialize kernel table
|
|
if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) {
|
|
delete kernels_;
|
|
return false;
|
|
} else {
|
|
size_t* table = reinterpret_cast<size_t*>(kernels_->map(nullptr, pal::Resource::WriteOnly));
|
|
for (auto& it : kernels()) {
|
|
HSAILKernel* kernel = static_cast<HSAILKernel*>(it.second);
|
|
table[kernel->index()] = static_cast<size_t>(kernel->gpuAqlCode());
|
|
}
|
|
kernels_->unmap(nullptr);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const { gpu.addVmMemory(&codeSegGpu()); }
|
|
|
|
#if defined(WITH_COMPILER_LIB)
|
|
const aclTargetInfo& HSAILProgram::info() {
|
|
acl_error err;
|
|
info_ = amd::Hsail::GetTargetInfo(palNullDevice().settings().use64BitPtr_ ? "hsail64" : "hsail",
|
|
device().isa().hsailName(), &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclGetTargetInfo failed");
|
|
}
|
|
return info_;
|
|
}
|
|
#endif
|
|
|
|
bool HSAILProgram::saveBinaryAndSetType(type_t type) {
|
|
#if defined(WITH_COMPILER_LIB)
|
|
// Write binary to memory
|
|
if (rawBinary_ != nullptr) {
|
|
// Free memory containing rawBinary
|
|
amd::Hsail::FreeMem(binaryElf_, rawBinary_);
|
|
rawBinary_ = nullptr;
|
|
}
|
|
size_t size = 0;
|
|
if (amd::Hsail::WriteToMem(binaryElf_, &rawBinary_, &size) != ACL_SUCCESS) {
|
|
buildLog_ += "Failed to write binary to memory \n";
|
|
return false;
|
|
}
|
|
setBinary(static_cast<char*>(rawBinary_), size);
|
|
// Set the type of binary
|
|
setType(type);
|
|
#endif // defined(WITH_COMPILER_LIB)
|
|
return true;
|
|
}
|
|
|
|
bool HSAILProgram::defineGlobalVar(const char* name, void* dptr) {
|
|
if (!device().isOnline()) {
|
|
return false;
|
|
}
|
|
|
|
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
|
|
|
hsa_status_t hsa_status =
|
|
executable_->DefineAgentExternalVariable(name, agent, HSA_VARIABLE_SEGMENT_GLOBAL, dptr);
|
|
if (HSA_STATUS_SUCCESS != hsa_status) {
|
|
buildLog_ += "Could not define Program External Variable";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, size_t* bytes,
|
|
const char* global_name) const {
|
|
if (!device().isOnline()) {
|
|
return false;
|
|
}
|
|
|
|
uint32_t length = 0;
|
|
size_t offset = 0;
|
|
uint32_t flags = 0;
|
|
amd::Memory* parent = nullptr;
|
|
hsa_symbol_kind_t type;
|
|
hsa_status_t status = HSA_STATUS_SUCCESS;
|
|
amd::hsa::loader::Symbol* symbol = nullptr;
|
|
|
|
if (amd_mem_obj == nullptr) {
|
|
buildLog_ += "amd_mem_obj is null";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
|
|
|
/* Retrieve the Symbol obj from global name*/
|
|
symbol = executable_->GetSymbol(global_name, &agent);
|
|
if (!symbol) {
|
|
buildLog_ += "Error: Getting Global Var Symbol";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Retrieve the symbol type */
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type)) {
|
|
buildLog_ += "Error: Getting Global Var Symbol Type";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Make sure the symbol is of type VARIABLE */
|
|
if (type != HSA_SYMBOL_KIND_VARIABLE) {
|
|
buildLog_ += "Error: Retrieve Symbol type is not Variable ";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Retrieve the symbol Name Length */
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &length)) {
|
|
buildLog_ += "Error: Getting Global Var Symbol length";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Retrieve the symbol name */
|
|
char* name = reinterpret_cast<char*>(alloca(length + 1));
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME, name)) {
|
|
buildLog_ += "Error: Getting Global Var Symbol name";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
name[length] = '\0';
|
|
|
|
/* Make sure the name matches with the global name */
|
|
if (std::string(name) != std::string(global_name)) {
|
|
buildLog_ += "Error: Global Var Name mismatch";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Retrieve the Symbol address */
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, device_pptr)) {
|
|
buildLog_ += "Error: Getting Global Var Symbol Address";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Retrieve the Symbol size */
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, bytes)) {
|
|
buildLog_ += "Error: Getting Global Var Symbol size";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Retrieve the Offset from global pal::Memory created @ segment::alloc */
|
|
if (!codeSegment_->gpuAddressOffset(reinterpret_cast<uint64_t>(*device_pptr), &offset)) {
|
|
buildLog_ += "Error: Cannot Retrieve the Address Offset";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
/* Create a View from the global pal::Memory */
|
|
flags = ROCCLR_MEM_INTERNAL_MEMORY;
|
|
parent = codeSegGpu_->owner();
|
|
*amd_mem_obj = new (parent->getContext()) amd::Buffer(*parent, flags, offset, *bytes);
|
|
|
|
if (*amd_mem_obj == nullptr) {
|
|
buildLog_ += "[OCL] Failed to create a mem object!";
|
|
buildLog_ += "\n";
|
|
return false;
|
|
}
|
|
|
|
if (!((*amd_mem_obj)->create(nullptr))) {
|
|
buildLog_ += "[OCL] failed to create a svm hidden buffer!";
|
|
buildLog_ += "\n";
|
|
(*amd_mem_obj)->release();
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
|
|
const amd::Isa* isa_p = amd::Isa::findIsa(name);
|
|
return {amd::Isa::toHandle(isa_p)};
|
|
}
|
|
|
|
bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
|
|
// The HSA loader uses a handle value of 0 to indicate the ISA is invalid.
|
|
const amd::Isa* code_object_isa_p = amd::Isa::fromHandle(isa.handle);
|
|
if (!code_object_isa_p || !code_object_isa_p->runtimePalSupported()) {
|
|
// The ISA is either not supported because PALHSALoaderContext::IsaFromName
|
|
// could not find it, or the PAL runtime does not support it.
|
|
return false;
|
|
}
|
|
return amd::Isa::isCompatible(*code_object_isa_p, program_->device().isa());
|
|
}
|
|
|
|
void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
|
|
size_t size, size_t align, bool zero) {
|
|
assert(size);
|
|
assert(align);
|
|
if (program_->isNull()) {
|
|
// Note: In Linux ::posix_memalign() requires at least 16 bytes for the alignment.
|
|
align = amd::alignUp(align, 16);
|
|
void* ptr = amd::Os::alignedMalloc(size, align);
|
|
if (ptr && zero) {
|
|
memset(ptr, 0, size);
|
|
}
|
|
return ptr;
|
|
}
|
|
|
|
std::unique_ptr<Segment> seg(new Segment());
|
|
if (!seg || !seg->alloc(*program_, segment, size, align, zero)) {
|
|
return nullptr;
|
|
}
|
|
return seg.release();
|
|
}
|
|
|
|
bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
|
|
void* dst, size_t offset, const void* src, size_t size) {
|
|
if (program_->isNull()) {
|
|
amd::Os::fastMemcpy(reinterpret_cast<address>(dst) + offset, src, size);
|
|
return true;
|
|
}
|
|
Segment* s = reinterpret_cast<Segment*>(dst);
|
|
s->copy(offset, src, size);
|
|
return true;
|
|
}
|
|
|
|
void PALHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
|
|
void* seg, size_t size) {
|
|
if (program_->isNull()) {
|
|
amd::Os::alignedFree(seg);
|
|
} else {
|
|
Segment* s = reinterpret_cast<Segment*>(seg);
|
|
delete s;
|
|
}
|
|
}
|
|
|
|
void* PALHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
|
|
void* seg, size_t offset) {
|
|
assert(seg);
|
|
if (program_->isNull()) {
|
|
return (reinterpret_cast<address>(seg) + offset);
|
|
}
|
|
Segment* s = reinterpret_cast<Segment*>(seg);
|
|
return reinterpret_cast<void*>(s->gpuAddress(offset));
|
|
}
|
|
|
|
void* PALHSALoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
|
|
void* seg, size_t offset) {
|
|
assert(seg);
|
|
if (program_->isNull()) {
|
|
return (reinterpret_cast<address>(seg) + offset);
|
|
}
|
|
Segment* s = reinterpret_cast<Segment*>(seg);
|
|
return s->cpuAddress(offset);
|
|
}
|
|
|
|
bool PALHSALoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
|
|
void* seg, size_t size) {
|
|
assert(seg);
|
|
if (program_->isNull()) {
|
|
return true;
|
|
}
|
|
Segment* s = reinterpret_cast<Segment*>(seg);
|
|
return s->freeze((segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) ? false : true);
|
|
}
|
|
|
|
hsa_status_t PALHSALoaderContext::SamplerCreate(
|
|
hsa_agent_t agent, const hsa_ext_sampler_descriptor_t* sampler_descriptor,
|
|
hsa_ext_sampler_t* sampler_handle) {
|
|
if (!agent.handle) {
|
|
return HSA_STATUS_ERROR_INVALID_AGENT;
|
|
}
|
|
if (!sampler_descriptor || !sampler_handle) {
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
|
|
if (program_->isNull()) {
|
|
// Offline compilation. Provide a fake non-null handle.
|
|
sampler_handle->handle = 1;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
uint32_t state = 0;
|
|
switch (sampler_descriptor->coordinate_mode) {
|
|
case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED:
|
|
state = amd::Sampler::StateNormalizedCoordsFalse;
|
|
break;
|
|
case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED:
|
|
state = amd::Sampler::StateNormalizedCoordsTrue;
|
|
break;
|
|
default:
|
|
assert(false);
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
switch (sampler_descriptor->filter_mode) {
|
|
case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST:
|
|
state |= amd::Sampler::StateFilterNearest;
|
|
break;
|
|
case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:
|
|
state |= amd::Sampler::StateFilterLinear;
|
|
break;
|
|
default:
|
|
assert(false);
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
switch (sampler_descriptor->address_mode) {
|
|
case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
|
|
state |= amd::Sampler::StateAddressClampToEdge;
|
|
break;
|
|
case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER:
|
|
state |= amd::Sampler::StateAddressClamp;
|
|
break;
|
|
case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:
|
|
state |= amd::Sampler::StateAddressRepeat;
|
|
break;
|
|
case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
|
|
state |= amd::Sampler::StateAddressMirroredRepeat;
|
|
break;
|
|
case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
|
|
state |= amd::Sampler::StateAddressNone;
|
|
break;
|
|
default:
|
|
assert(false);
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
std::unique_ptr<pal::Sampler> sampler(new pal::Sampler(program_->palDevice()));
|
|
if (!sampler || !sampler->create(state)) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
sampler_handle->handle = sampler->hwSrd();
|
|
program_->addSampler(sampler.release());
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t PALHSALoaderContext::SamplerDestroy(hsa_agent_t agent,
|
|
hsa_ext_sampler_t sampler_handle) {
|
|
if (!agent.handle) {
|
|
return HSA_STATUS_ERROR_INVALID_AGENT;
|
|
}
|
|
if (!sampler_handle.handle) {
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
// Samplers will be destroyed by the pal::HSAILProgam destructor.
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
#if defined(USE_COMGR_LIBRARY)
|
|
|
|
static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executable_symbol_t hSymbol,
|
|
void* data) {
|
|
auto symbol = Symbol::Object(hSymbol);
|
|
auto symbolNameList = reinterpret_cast<std::vector<std::string>*>(data);
|
|
|
|
hsa_symbol_kind_t type;
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type)) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
if (type == HSA_SYMBOL_KIND_KERNEL) {
|
|
uint32_t length;
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &length)) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
char* name = reinterpret_cast<char*>(alloca(length + 1));
|
|
if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME, name)) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
name[length] = '\0';
|
|
|
|
symbolNameList->push_back(std::string(name));
|
|
}
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
#endif // defined(USE_COMGR_LIBRARY)
|
|
|
|
bool LightningProgram::createBinary(amd::option::Options* options) {
|
|
#if defined(USE_COMGR_LIBRARY)
|
|
if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) {
|
|
LogError("Failed to create ELF binary image!");
|
|
return false;
|
|
}
|
|
#endif // defined(USE_COMGR_LIBRARY)
|
|
return true;
|
|
}
|
|
|
|
bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
|
bool internalKernel) {
|
|
#if defined(USE_COMGR_LIBRARY)
|
|
// Find the size of global variables from the binary
|
|
if (!FindGlobalVarSize(binary, binSize)) {
|
|
buildLog_ += "Error: Cannot Find Global Var Sizes\n";
|
|
return false;
|
|
}
|
|
|
|
for (const auto& kernelMeta : kernelMetadataMap_) {
|
|
auto kernelName = kernelMeta.first;
|
|
auto kernel = new LightningKernel(kernelName, this, internalKernel);
|
|
if (kernel == nullptr) {
|
|
return false;
|
|
}
|
|
if (!kernel->init()) {
|
|
buildLog_ += "[ROC][Kernel] Could not get Code Prop Meta Data \n";
|
|
return false;
|
|
}
|
|
kernels()[kernelName] = kernel;
|
|
|
|
kernel->setUniformWorkGroupSize(useUniformWorkGroupSize);
|
|
}
|
|
|
|
executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr);
|
|
if (executable_ == nullptr) {
|
|
LogError("Error: Executable for AMD HSA Code Object isn't created.");
|
|
return false;
|
|
}
|
|
|
|
hsa_code_object_t code_object;
|
|
code_object.handle = reinterpret_cast<uint64_t>(binary);
|
|
|
|
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
|
|
|
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogError("Error: AMD HSA Code Object loading failed.");
|
|
return false;
|
|
}
|
|
|
|
status = executable_->Freeze(nullptr);
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogError("Error: Freezing the executable failed.");
|
|
return false;
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
bool LightningProgram::setKernels(void* binary, size_t binSize,
|
|
amd::Os::FileDesc fdesc, size_t foffset, std::string uri) {
|
|
#if defined(USE_COMGR_LIBRARY)
|
|
// Collect the information about compiled binary
|
|
if (!isNull() && (palDevice().rgpCaptureMgr() != nullptr)) {
|
|
apiHash_ = palDevice().rgpCaptureMgr()->AddElfBinary(binary, binSize, binary, binSize,
|
|
codeSegGpu_->iMem(), codeSegGpu_->offset());
|
|
}
|
|
|
|
for (auto& kit : kernels()) {
|
|
LightningKernel* kernel = static_cast<LightningKernel*>(kit.second);
|
|
if (!kernel->postLoad()) {
|
|
return false;
|
|
}
|
|
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
|
|
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
|
|
maxScratchRegs_ =
|
|
std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
|
maxVgprs_ = std::max(static_cast<uint>(kernel->workGroupInfo()->usedVGPRs_), maxVgprs_);
|
|
}
|
|
DestroySegmentCpuAccess();
|
|
#endif // defined(USE_COMGR_LIBRARY)
|
|
return true;
|
|
}
|
|
|
|
} // namespace pal
|