P4 to Git Change 1330796 by gandryey@gera-w8 on 2016/10/24 15:12:41
SWDEV-86035 - Add PAL backend to OpenCL
- Use loader for the code objects loading and avoid allocation of each individual kernel
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#33 edit
[ROCm/clr commit: 4e1f32bd71]
Этот коммит содержится в:
@@ -377,11 +377,11 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
|
||||
if (!sym) {
|
||||
return false;
|
||||
}
|
||||
uint64_t akc_addr = 0;
|
||||
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&akc_addr))) {
|
||||
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
|
||||
return false;
|
||||
}
|
||||
amd_kernel_code_t *akc = reinterpret_cast<amd_kernel_code_t*>(akc_addr);
|
||||
|
||||
amd_kernel_code_t *akc = reinterpret_cast<amd_kernel_code_t*>(prog().findHostKernelAddress(code_));
|
||||
cpuAqlCode_ = akc;
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast<void*>(&codeSize_))) {
|
||||
return false;
|
||||
@@ -390,22 +390,6 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) {
|
||||
return false;
|
||||
}
|
||||
// Allocate HW resources for the real program only
|
||||
if (!prog().isNull()) {
|
||||
code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
|
||||
Resource::MemoryType type = Resource::Local;
|
||||
|
||||
// Initialize kernel ISA code
|
||||
if (code_ && code_->create(type)) {
|
||||
constexpr bool WaitForUpload = true;
|
||||
code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_,
|
||||
reinterpret_cast<void*>(akc), WaitForUpload);
|
||||
}
|
||||
else {
|
||||
LogError("Failed to allocate ISA code!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
|
||||
"Scratch must be DWORD aligned");
|
||||
@@ -591,9 +575,8 @@ HSAILKernel::HSAILKernel(std::string name,
|
||||
, dev_(prog->dev())
|
||||
, prog_(*prog)
|
||||
, index_(0)
|
||||
, code_(nullptr)
|
||||
, code_(0)
|
||||
, codeSize_(0)
|
||||
, hwMetaData_(nullptr)
|
||||
, extraArgumentsNum_(extraArgsNum)
|
||||
, waveLimiter_(this, (prog->isNull() ? 1 :
|
||||
dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * dev().hwInfo()->simdPerCU_)
|
||||
@@ -608,10 +591,6 @@ HSAILKernel::~HSAILKernel()
|
||||
delete arg;
|
||||
arguments_.pop_back();
|
||||
}
|
||||
|
||||
delete [] hwMetaData_;
|
||||
|
||||
delete code_;
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -1217,7 +1196,7 @@ HSAILKernel::loadArguments(
|
||||
// Initialize kernel ISA and execution buffer requirements
|
||||
hsaDisp->private_segment_size = spillSegSize();
|
||||
hsaDisp->group_segment_size = ldsAddress - ldsSize();
|
||||
hsaDisp->kernel_object = gpuAqlCode()->vmAddress();
|
||||
hsaDisp->kernel_object = gpuAqlCode();
|
||||
|
||||
ConstBuffer* cb = gpu.constBufs_[0];
|
||||
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
|
||||
@@ -1228,7 +1207,7 @@ HSAILKernel::loadArguments(
|
||||
hsaDisp->completion_signal.handle = 0;
|
||||
|
||||
memList.push_back(cb);
|
||||
memList.push_back(gpuAqlCode());
|
||||
memList.push_back(&prog().codeSegGpu());
|
||||
for (pal::Memory * mem : prog().globalStores()) {
|
||||
memList.push_back(mem);
|
||||
}
|
||||
|
||||
@@ -154,7 +154,7 @@ public:
|
||||
const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; }
|
||||
|
||||
//! Returns memory object with AQL code
|
||||
pal::Memory* gpuAqlCode() const { return code_; }
|
||||
uint64_t gpuAqlCode() const { return code_; }
|
||||
|
||||
//! Returns size of AQL code
|
||||
size_t aqlCodeSize() const { return codeSize_; }
|
||||
@@ -249,12 +249,10 @@ protected:
|
||||
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
|
||||
uint index_; //!< Kernel index in the program
|
||||
|
||||
pal::Memory* code_; //!< Memory object with ISA code
|
||||
uint64_t code_; //!< GPU memory pointer to the kernel
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
|
||||
char* hwMetaData_; //!< SI metadata
|
||||
|
||||
uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
|
||||
uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
|
||||
|
||||
union Flags {
|
||||
struct {
|
||||
|
||||
@@ -34,6 +34,8 @@ HSAILProgram::HSAILProgram(Device& device)
|
||||
, binaryElf_(nullptr)
|
||||
, rawBinary_(nullptr)
|
||||
, kernels_(nullptr)
|
||||
, codeSegGpu_(nullptr)
|
||||
, codeSegCpu_(nullptr)
|
||||
, maxScratchRegs_(0)
|
||||
, flags_(0)
|
||||
, executable_(nullptr)
|
||||
@@ -54,6 +56,8 @@ HSAILProgram::HSAILProgram(NullDevice& device)
|
||||
, binaryElf_(nullptr)
|
||||
, rawBinary_(nullptr)
|
||||
, kernels_(nullptr)
|
||||
, codeSegGpu_(nullptr)
|
||||
, codeSegCpu_(nullptr)
|
||||
, maxScratchRegs_(0)
|
||||
, flags_(0)
|
||||
, executable_(nullptr)
|
||||
@@ -93,6 +97,8 @@ HSAILProgram::~HSAILProgram()
|
||||
}
|
||||
delete kernels_;
|
||||
amd::hsa::loader::Loader::Destroy(loader_);
|
||||
assert((codeSegGpu_ == nullptr) && "Loader didn't destroy code!");
|
||||
assert((codeSegCpu_ == nullptr) && "Loader didn't destroy code!");
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -470,6 +476,9 @@ HSAILProgram::linkImpl(amd::option::Options* options)
|
||||
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
|
||||
bool finalize = true;
|
||||
bool hsaLoad = true;
|
||||
internal_ = (compileOptions_.find("-cl-internal-kernel") !=
|
||||
std::string::npos) ? true : false;
|
||||
|
||||
|
||||
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
|
||||
if (!binaryElf_) {
|
||||
@@ -545,6 +554,11 @@ HSAILProgram::linkImpl(amd::option::Options* options)
|
||||
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
|
||||
return false;
|
||||
}
|
||||
status = executable_->Freeze(nullptr);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
buildLog_ += "Error: AMD HSA Code Object freeze failed.\n";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
size_t kernelNamesSize = 0;
|
||||
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
|
||||
@@ -687,8 +701,7 @@ HSAILProgram::allocKernelTable()
|
||||
kernels_->map(nullptr, pal::Resource::WriteOnly));
|
||||
for (auto& it : kernels()) {
|
||||
HSAILKernel* kernel = static_cast<HSAILKernel*>(it.second);
|
||||
table[kernel->index()] = static_cast<size_t>(
|
||||
kernel->gpuAqlCode()->vmAddress());
|
||||
table[kernel->index()] = static_cast<size_t>(kernel->gpuAqlCode());
|
||||
}
|
||||
kernels_->unmap(nullptr);
|
||||
}
|
||||
@@ -699,10 +712,7 @@ void
|
||||
HSAILProgram::fillResListWithKernels(
|
||||
std::vector<const Memory*>& memList) const
|
||||
{
|
||||
for (auto& it : kernels()) {
|
||||
memList.push_back(
|
||||
static_cast<HSAILKernel*>(it.second)->gpuAqlCode());
|
||||
}
|
||||
memList.push_back(&codeSegGpu());
|
||||
}
|
||||
|
||||
const aclTargetInfo &
|
||||
@@ -749,7 +759,7 @@ HSAILProgram::saveBinaryAndSetType(type_t type)
|
||||
return true;
|
||||
}
|
||||
|
||||
hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
|
||||
hsa_isa_t PALHSALoaderContext::IsaFromName(const char *name) {
|
||||
hsa_isa_t isa = {0};
|
||||
if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; }
|
||||
if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; }
|
||||
@@ -762,7 +772,7 @@ hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
|
||||
return isa;
|
||||
}
|
||||
|
||||
bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
|
||||
bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
|
||||
switch (program_->dev().hwInfo()->gfxipVersion_) {
|
||||
default:
|
||||
LogError("Unsupported gfxip version");
|
||||
@@ -785,7 +795,7 @@ bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
|
||||
}
|
||||
}
|
||||
|
||||
void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
|
||||
void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, size_t size, size_t align, bool zero) {
|
||||
assert(size);
|
||||
assert(align);
|
||||
@@ -795,13 +805,13 @@ void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
|
||||
case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
|
||||
return AgentGlobalAlloc(agent, size, align, zero);
|
||||
case AMDGPU_HSA_SEGMENT_CODE_AGENT:
|
||||
return KernelCodeAlloc(agent, size, align, zero);
|
||||
return KernelCodeAlloc(size, align, zero);
|
||||
default:
|
||||
assert(false); return 0;
|
||||
}
|
||||
}
|
||||
|
||||
bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
|
||||
bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
|
||||
switch (segment) {
|
||||
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
|
||||
@@ -815,8 +825,9 @@ bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
|
||||
}
|
||||
}
|
||||
|
||||
void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t size) {
|
||||
void PALHSALoaderContext::SegmentFree(
|
||||
amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size)
|
||||
{
|
||||
switch (segment) {
|
||||
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
|
||||
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
|
||||
@@ -827,25 +838,72 @@ void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
|
||||
}
|
||||
}
|
||||
|
||||
void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t offset) {
|
||||
void* PALHSALoaderContext::SegmentAddress(
|
||||
amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset)
|
||||
{
|
||||
assert(seg);
|
||||
switch (segment) {
|
||||
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
|
||||
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
|
||||
case AMDGPU_HSA_SEGMENT_READONLY_AGENT: {
|
||||
case AMDGPU_HSA_SEGMENT_CODE_AGENT:
|
||||
if (!program_->isNull()) {
|
||||
pal::Memory *gpuMem = reinterpret_cast<pal::Memory*>(seg);
|
||||
return reinterpret_cast<void*>(gpuMem->vmAddress() + offset);
|
||||
}
|
||||
else {
|
||||
return reinterpret_cast<address>(seg) + offset;
|
||||
}
|
||||
}
|
||||
case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset;
|
||||
default:
|
||||
assert(false); return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t ORCAHSALoaderContext::SamplerCreate(
|
||||
//! Returns the host (CPU) address that corresponds to \p offset inside a loader segment.
//!
//! Only the kernel code segment has a host-side copy: the result is the program's
//! CPU code memory (codeSegCpu()) advanced by \p offset. Global program, global agent,
//! read-only agent and any unrecognized segment yield nullptr.
//!
//! \param segment  Segment kind the loader is asking about
//! \param agent    HSA agent (not used by this implementation)
//! \param seg      Segment handle; only asserted to be non-null
//! \param offset   Byte offset inside the segment
//! \return Host pointer for the code segment, nullptr otherwise
void* PALHSALoaderContext::SegmentHostAddress(
    amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset)
{
    assert(seg);

    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
        // The code segment is addressed through the program's CPU code memory
        return program_->codeSegCpu() + offset;
    }

    // No host address is provided for any other segment
    return nullptr;
}
|
||||
|
||||
//! Called by the loader when a segment is frozen.
//!
//! Global program, global agent and read-only agent segments need no work.
//! For the kernel code segment of a non-null program, \p size bytes are written from
//! the program's CPU code memory (codeSegCpu()) into its GPU code memory (codeSegGpu())
//! through the device transfer queue, with the WaitForCopy flag set.
//!
//! \param segment  Segment kind being frozen
//! \param agent    HSA agent (not used by this implementation)
//! \param seg      Segment handle; only asserted to be non-null
//! \param size     Number of bytes to upload for the code segment
//! \return true for every known segment kind, false (after an assert) for unknown kinds
bool PALHSALoaderContext::SegmentFreeze(
    amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size)
{
    assert(seg);

    const bool nothingToDo =
        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
        (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
    if (nothingToDo) {
        return true;
    }

    if (segment != AMDGPU_HSA_SEGMENT_CODE_AGENT) {
        assert(false);
        return false;
    }

    // A null program has no GPU memory to upload into
    if (!program_->isNull()) {
        constexpr bool WaitForCopy = true;
        const pal::Memory& gpuCode = program_->codeSegGpu();
        gpuCode.writeRawData(*gpuCode.dev().xferQueue(), 0, size,
            program_->codeSegCpu(), WaitForCopy);
    }
    return true;
}
|
||||
|
||||
hsa_status_t PALHSALoaderContext::SamplerCreate(
|
||||
hsa_agent_t agent,
|
||||
const hsa_ext_sampler_descriptor_t *sampler_descriptor,
|
||||
hsa_ext_sampler_t *sampler_handle)
|
||||
@@ -897,8 +955,9 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
|
||||
hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
|
||||
hsa_status_t PALHSALoaderContext::SamplerDestroy(
|
||||
hsa_agent_t agent, hsa_ext_sampler_t sampler_handle)
|
||||
{
|
||||
if (!agent.handle) {
|
||||
return HSA_STATUS_ERROR_INVALID_AGENT;
|
||||
}
|
||||
@@ -908,7 +967,8 @@ hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
address PALHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero)
|
||||
{
|
||||
assert(size);
|
||||
assert(align);
|
||||
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
|
||||
@@ -917,26 +977,21 @@ void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
if (zero) {
|
||||
memset(ptr, 0, size);
|
||||
}
|
||||
return ptr;
|
||||
return reinterpret_cast<address>(ptr);
|
||||
}
|
||||
|
||||
bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {
|
||||
if (!dst || !src || dst == src) {
|
||||
return false;
|
||||
}
|
||||
if (0 == size) {
|
||||
return true;
|
||||
}
|
||||
amd::Os::fastMemcpy((char*)dst + offset, src, size);
|
||||
return true;
|
||||
bool PALHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size)
|
||||
{
|
||||
amd::Os::fastMemcpy((char*)dst + offset, src, size);
|
||||
return true;
|
||||
}
|
||||
|
||||
void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
void* PALHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
assert(size);
|
||||
assert(align);
|
||||
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
|
||||
if (program_->isNull()) {
|
||||
return new char[size];
|
||||
return CpuMemAlloc(size, align, zero);
|
||||
}
|
||||
|
||||
pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
|
||||
@@ -945,7 +1000,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
return nullptr;
|
||||
}
|
||||
assert(program_->dev().xferQueue());
|
||||
if (zero) {
|
||||
if (zero && !program_->isInternal()) {
|
||||
char pattern = 0;
|
||||
program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size));
|
||||
}
|
||||
@@ -954,7 +1009,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
return mem;
|
||||
}
|
||||
|
||||
bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
|
||||
bool PALHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
|
||||
if (!dst || !src || dst == src) {
|
||||
return false;
|
||||
}
|
||||
@@ -962,7 +1017,7 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
|
||||
return true;
|
||||
}
|
||||
if (program_->isNull()) {
|
||||
memcpy(reinterpret_cast<address>(dst) + offset, src, size);
|
||||
CpuMemCopy(dst, offset, src, size);
|
||||
return true;
|
||||
}
|
||||
assert(program_->dev().xferQueue());
|
||||
@@ -972,16 +1027,62 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
|
||||
return true;
|
||||
}
|
||||
|
||||
void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size)
|
||||
void PALHSALoaderContext::GpuMemFree(void *ptr, size_t size)
|
||||
{
|
||||
if (program_->isNull()) {
|
||||
delete[] reinterpret_cast<char*>(ptr);
|
||||
CpuMemFree(ptr, size);
|
||||
}
|
||||
else {
|
||||
delete reinterpret_cast<pal::Memory*>(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
void* PALHSALoaderContext::KernelCodeAlloc(
|
||||
size_t size, size_t align, bool zero)
|
||||
{
|
||||
address host = CpuMemAlloc(size, align, zero);
|
||||
pal::Memory* mem = nullptr;
|
||||
|
||||
if (!program_->isNull()) {
|
||||
mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
|
||||
if (!mem || !mem->create(pal::Resource::Local)) {
|
||||
delete mem;
|
||||
mem = nullptr;
|
||||
}
|
||||
}
|
||||
program_->setCodeObjects(mem, host);
|
||||
return ((host == nullptr || mem == nullptr) ? nullptr : mem);
|
||||
}
|
||||
|
||||
bool PALHSALoaderContext::KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size)
|
||||
{
|
||||
if (!dst || !src || dst == src) {
|
||||
return false;
|
||||
}
|
||||
if (0 == size) {
|
||||
return true;
|
||||
}
|
||||
if (program_->isNull()) {
|
||||
return CpuMemCopy(dst, offset, src, size);
|
||||
}
|
||||
assert(program_->dev().xferQueue());
|
||||
pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
|
||||
if (mem == &program_->codeSegGpu()) {
|
||||
return CpuMemCopy(program_->codeSegCpu(), offset, src, size);
|
||||
}
|
||||
assert(!"The segement doesn't match code segment in the program!");
|
||||
return false;
|
||||
}
|
||||
|
||||
void PALHSALoaderContext::KernelCodeFree(void *ptr, size_t size)
|
||||
{
|
||||
CpuMemFree(program_->codeSegCpu(), size);
|
||||
if (!program_->isNull()) {
|
||||
delete reinterpret_cast<pal::Memory*>(ptr);
|
||||
}
|
||||
program_->setCodeObjects(nullptr, nullptr);
|
||||
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
static hsa_status_t
|
||||
@@ -1019,6 +1120,8 @@ bool
|
||||
LightningProgram::linkImpl(amd::option::Options *options)
|
||||
{
|
||||
using namespace amd::opencl_driver;
|
||||
internal_ = (compileOptions_.find("-cl-internal-kernel") !=
|
||||
std::string::npos) ? true : false;
|
||||
|
||||
aclType continueCompileFrom = llvmBinary_.empty()
|
||||
? getNextCompilationStageFromBinary(options)
|
||||
@@ -1270,12 +1373,11 @@ LightningProgram::setKernels(
|
||||
return false;
|
||||
}
|
||||
|
||||
/* FIXME_lmoriche: We need to call this!
|
||||
status = executable_->Freeze(nullptr);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
buildLog_ += "Error: Freezing the executable failed: ";
|
||||
return false;
|
||||
}*/
|
||||
}
|
||||
|
||||
size_t progvarsTotalSize = 0;
|
||||
|
||||
|
||||
@@ -34,11 +34,11 @@ namespace pal {
|
||||
using namespace amd::hsa::loader;
|
||||
class HSAILProgram;
|
||||
|
||||
class ORCAHSALoaderContext final: public Context {
|
||||
class PALHSALoaderContext final: public Context {
|
||||
public:
|
||||
ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}
|
||||
PALHSALoaderContext(HSAILProgram* program): program_(program) {}
|
||||
|
||||
virtual ~ORCAHSALoaderContext() {}
|
||||
virtual ~PALHSALoaderContext() {}
|
||||
|
||||
hsa_isa_t IsaFromName(const char *name) override;
|
||||
|
||||
@@ -58,12 +58,10 @@ public:
|
||||
hsa_agent_t agent, void* seg, size_t offset) override;
|
||||
|
||||
void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t offset) override {
|
||||
return nullptr;
|
||||
}
|
||||
hsa_agent_t agent, void* seg, size_t offset) override;
|
||||
|
||||
bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t size) override { return false; }
|
||||
hsa_agent_t agent, void* seg, size_t size) override;
|
||||
|
||||
bool ImageExtensionSupported() override { return false; }
|
||||
|
||||
@@ -109,20 +107,13 @@ private:
|
||||
GpuMemFree(ptr, size);
|
||||
}
|
||||
|
||||
void* KernelCodeAlloc(
|
||||
hsa_agent_t agent, size_t size, size_t align, bool zero) {
|
||||
return CpuMemAlloc(size, align, zero);
|
||||
}
|
||||
void* KernelCodeAlloc(size_t size, size_t align, bool zero);
|
||||
|
||||
bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
|
||||
return CpuMemCopy(dst, offset, src, size);
|
||||
}
|
||||
bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size);
|
||||
|
||||
void KernelCodeFree(void *ptr, size_t size) {
|
||||
CpuMemFree(ptr, size);
|
||||
}
|
||||
void KernelCodeFree(void *ptr, size_t size);
|
||||
|
||||
void* CpuMemAlloc(size_t size, size_t align, bool zero);
|
||||
address CpuMemAlloc(size_t size, size_t align, bool zero);
|
||||
|
||||
bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
|
||||
|
||||
@@ -136,9 +127,9 @@ private:
|
||||
|
||||
void GpuMemFree(void *ptr, size_t size = 0);
|
||||
|
||||
ORCAHSALoaderContext(const ORCAHSALoaderContext &c);
|
||||
PALHSALoaderContext(const PALHSALoaderContext &c);
|
||||
|
||||
ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);
|
||||
PALHSALoaderContext& operator=(const PALHSALoaderContext &c);
|
||||
|
||||
pal::HSAILProgram* program_;
|
||||
};
|
||||
@@ -160,6 +151,9 @@ public:
|
||||
|
||||
void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }
|
||||
|
||||
void setCodeObjects(Memory* codeGpu, address codeCpu)
|
||||
{ codeSegGpu_ = codeGpu; codeSegCpu_ = codeCpu; }
|
||||
|
||||
const std::vector<Memory*>& globalStores() const { return globalStores_; }
|
||||
|
||||
//! Return a typecasted GPU device
|
||||
@@ -182,9 +176,24 @@ public:
|
||||
//! Returns TRUE if the program just compiled
|
||||
bool isNull() const { return isNull_; }
|
||||
|
||||
//! Returns TRUE if the program is used internally by the runtime
|
||||
bool isInternal() const { return internal_; }
|
||||
|
||||
//! Returns TRUE if the program contains static samplers
|
||||
bool isStaticSampler() const { return (staticSamplers_.size() != 0); }
|
||||
|
||||
//! Returns code segment on GPU
|
||||
const Memory& codeSegGpu() const { return *codeSegGpu_; }
|
||||
|
||||
//! Returns code segment on CPU
|
||||
address codeSegCpu() const { return codeSegCpu_; }
|
||||
|
||||
//! Returns CPU address for a kernel
|
||||
uint64_t findHostKernelAddress(uint64_t devAddr) const
|
||||
{
|
||||
return loader_->FindHostAddress(devAddr);
|
||||
}
|
||||
|
||||
protected:
|
||||
//! pre-compile setup for GPU
|
||||
virtual bool initBuild(amd::option::Options* options);
|
||||
@@ -270,17 +279,20 @@ protected:
|
||||
aclBinaryOptions binOpts_; //!< Binary options to create aclBinary
|
||||
std::vector<Memory*> globalStores_; //!< Global memory for the program
|
||||
Memory* kernels_; //!< Table with kernel object pointers
|
||||
Memory* codeSegGpu_; //!< GPU memory with code objects
|
||||
address codeSegCpu_; //!< CPU memory with code objects
|
||||
uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
|
||||
std::list<Sampler*> staticSamplers_; //!< List of internal static samplers
|
||||
union {
|
||||
struct {
|
||||
uint32_t isNull_ : 1; //!< Null program no memory allocations
|
||||
uint32_t internal_ : 1; //!< Internal blit program
|
||||
};
|
||||
uint32_t flags_; //!< Program flags
|
||||
};
|
||||
amd::hsa::loader::Loader* loader_; //!< Loader object
|
||||
amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader
|
||||
ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader
|
||||
PALHSALoaderContext loaderContext_; //!< Context for HSA Loader
|
||||
};
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
@@ -3300,7 +3300,7 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
|
||||
kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());
|
||||
|
||||
// Get the address of the kernel code and its size for CPU access
|
||||
pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
|
||||
/* pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
|
||||
if (nullptr != aqlCode) {
|
||||
address aqlCodeAddr = static_cast<address>(aqlCode->map(nullptr, 0));
|
||||
dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
|
||||
@@ -3309,7 +3309,7 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
|
||||
else {
|
||||
dbgManager->setKernelCodeInfo(nullptr, 0);
|
||||
}
|
||||
|
||||
*/
|
||||
kernelInfo.trapPresent = false;
|
||||
kernelInfo.trapHandler = nullptr;
|
||||
kernelInfo.trapHandlerBuffer = nullptr;
|
||||
|
||||
Ссылка в новой задаче
Block a user