From bc18cf7a72cecdfe9783dd070f074bfc701acf6b Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 24 Oct 2016 15:25:54 -0400
Subject: [PATCH] P4 to Git Change 1330796 by gandryey@gera-w8 on 2016/10/24
15:12:41
SWDEV-86035 - Add PAL backend to OpenCL
- Use the loader to load the code objects and avoid a separate memory allocation for each individual kernel
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#33 edit
[ROCm/clr commit: 4e1f32bd714cdd631a122668b1702f142fffe476]
---
.../rocclr/runtime/device/pal/palkernel.cpp | 33 +---
.../rocclr/runtime/device/pal/palkernel.hpp | 8 +-
.../rocclr/runtime/device/pal/palprogram.cpp | 180 ++++++++++++++----
.../rocclr/runtime/device/pal/palprogram.hpp | 54 ++++--
.../rocclr/runtime/device/pal/palvirtual.cpp | 4 +-
5 files changed, 185 insertions(+), 94 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index b639715f00..e45a4ef514 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -377,11 +377,11 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
if (!sym) {
return false;
}
- uint64_t akc_addr = 0;
- if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&akc_addr))) {
+ if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) {
return false;
}
- amd_kernel_code_t *akc = reinterpret_cast(akc_addr);
+
+ amd_kernel_code_t *akc = reinterpret_cast(prog().findHostKernelAddress(code_));
cpuAqlCode_ = akc;
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast(&codeSize_))) {
return false;
@@ -390,22 +390,6 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) {
return false;
}
- // Allocate HW resources for the real program only
- if (!prog().isNull()) {
- code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
- Resource::MemoryType type = Resource::Local;
-
- // Initialize kernel ISA code
- if (code_ && code_->create(type)) {
- constexpr bool WaitForUpload = true;
- code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_,
- reinterpret_cast(akc), WaitForUpload);
- }
- else {
- LogError("Failed to allocate ISA code!");
- return false;
- }
- }
assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
"Scratch must be DWORD aligned");
@@ -591,9 +575,8 @@ HSAILKernel::HSAILKernel(std::string name,
, dev_(prog->dev())
, prog_(*prog)
, index_(0)
- , code_(nullptr)
+ , code_(0)
, codeSize_(0)
- , hwMetaData_(nullptr)
, extraArgumentsNum_(extraArgsNum)
, waveLimiter_(this, (prog->isNull() ? 1 :
dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * dev().hwInfo()->simdPerCU_)
@@ -608,10 +591,6 @@ HSAILKernel::~HSAILKernel()
delete arg;
arguments_.pop_back();
}
-
- delete [] hwMetaData_;
-
- delete code_;
}
bool
@@ -1217,7 +1196,7 @@ HSAILKernel::loadArguments(
// Initialize kernel ISA and execution buffer requirements
hsaDisp->private_segment_size = spillSegSize();
hsaDisp->group_segment_size = ldsAddress - ldsSize();
- hsaDisp->kernel_object = gpuAqlCode()->vmAddress();
+ hsaDisp->kernel_object = gpuAqlCode();
ConstBuffer* cb = gpu.constBufs_[0];
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
@@ -1228,7 +1207,7 @@ HSAILKernel::loadArguments(
hsaDisp->completion_signal.handle = 0;
memList.push_back(cb);
- memList.push_back(gpuAqlCode());
+ memList.push_back(&prog().codeSegGpu());
for (pal::Memory * mem : prog().globalStores()) {
memList.push_back(mem);
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
index c4fc7c0c74..581ea3adae 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
@@ -154,7 +154,7 @@ public:
const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; }
//! Returns memory object with AQL code
- pal::Memory* gpuAqlCode() const { return code_; }
+ uint64_t gpuAqlCode() const { return code_; }
//! Returns size of AQL code
size_t aqlCodeSize() const { return codeSize_; }
@@ -249,12 +249,10 @@ protected:
std::vector printf_; //!< Format strings for GPU printf support
uint index_; //!< Kernel index in the program
- pal::Memory* code_; //!< Memory object with ISA code
+ uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
- char* hwMetaData_; //!< SI metadata
-
- uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
+ uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
union Flags {
struct {
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
index 8fe8796525..dd564702bc 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
@@ -34,6 +34,8 @@ HSAILProgram::HSAILProgram(Device& device)
, binaryElf_(nullptr)
, rawBinary_(nullptr)
, kernels_(nullptr)
+ , codeSegGpu_(nullptr)
+ , codeSegCpu_(nullptr)
, maxScratchRegs_(0)
, flags_(0)
, executable_(nullptr)
@@ -54,6 +56,8 @@ HSAILProgram::HSAILProgram(NullDevice& device)
, binaryElf_(nullptr)
, rawBinary_(nullptr)
, kernels_(nullptr)
+ , codeSegGpu_(nullptr)
+ , codeSegCpu_(nullptr)
, maxScratchRegs_(0)
, flags_(0)
, executable_(nullptr)
@@ -93,6 +97,8 @@ HSAILProgram::~HSAILProgram()
}
delete kernels_;
amd::hsa::loader::Loader::Destroy(loader_);
+ assert((codeSegGpu_ == nullptr) && "Loader didn't destroy code!");
+ assert((codeSegCpu_ == nullptr) && "Loader didn't destroy code!");
}
bool
@@ -470,6 +476,9 @@ HSAILProgram::linkImpl(amd::option::Options* options)
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
bool finalize = true;
bool hsaLoad = true;
+ internal_ = (compileOptions_.find("-cl-internal-kernel") !=
+ std::string::npos) ? true : false;
+
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
if (!binaryElf_) {
@@ -545,6 +554,11 @@ HSAILProgram::linkImpl(amd::option::Options* options)
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
return false;
}
+ status = executable_->Freeze(nullptr);
+ if (status != HSA_STATUS_SUCCESS) {
+ buildLog_ += "Error: AMD HSA Code Object freeze failed.\n";
+ return false;
+ }
}
size_t kernelNamesSize = 0;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
@@ -687,8 +701,7 @@ HSAILProgram::allocKernelTable()
kernels_->map(nullptr, pal::Resource::WriteOnly));
for (auto& it : kernels()) {
HSAILKernel* kernel = static_cast(it.second);
- table[kernel->index()] = static_cast(
- kernel->gpuAqlCode()->vmAddress());
+ table[kernel->index()] = static_cast(kernel->gpuAqlCode());
}
kernels_->unmap(nullptr);
}
@@ -699,10 +712,7 @@ void
HSAILProgram::fillResListWithKernels(
std::vector& memList) const
{
- for (auto& it : kernels()) {
- memList.push_back(
- static_cast(it.second)->gpuAqlCode());
- }
+ memList.push_back(&codeSegGpu());
}
const aclTargetInfo &
@@ -749,7 +759,7 @@ HSAILProgram::saveBinaryAndSetType(type_t type)
return true;
}
-hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
+hsa_isa_t PALHSALoaderContext::IsaFromName(const char *name) {
hsa_isa_t isa = {0};
if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; }
if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; }
@@ -762,7 +772,7 @@ hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
return isa;
}
-bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
+bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
switch (program_->dev().hwInfo()->gfxipVersion_) {
default:
LogError("Unsupported gfxip version");
@@ -785,7 +795,7 @@ bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
}
}
-void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, size_t size, size_t align, bool zero) {
assert(size);
assert(align);
@@ -795,13 +805,13 @@ void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
return AgentGlobalAlloc(agent, size, align, zero);
case AMDGPU_HSA_SEGMENT_CODE_AGENT:
- return KernelCodeAlloc(agent, size, align, zero);
+ return KernelCodeAlloc(size, align, zero);
default:
assert(false); return 0;
}
}
-bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
switch (segment) {
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
@@ -815,8 +825,9 @@ bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
}
}
-void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
- hsa_agent_t agent, void* seg, size_t size) {
+void PALHSALoaderContext::SegmentFree(
+ amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size)
+{
switch (segment) {
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
@@ -827,25 +838,72 @@ void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
}
}
-void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
- hsa_agent_t agent, void* seg, size_t offset) {
+void* PALHSALoaderContext::SegmentAddress(
+ amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset)
+{
assert(seg);
switch (segment) {
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
case AMDGPU_HSA_SEGMENT_READONLY_AGENT: {
+ case AMDGPU_HSA_SEGMENT_CODE_AGENT:
if (!program_->isNull()) {
pal::Memory *gpuMem = reinterpret_cast(seg);
return reinterpret_cast(gpuMem->vmAddress() + offset);
}
+ else {
+ return reinterpret_cast(seg) + offset;
+ }
}
- case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset;
default:
assert(false); return nullptr;
}
}
-hsa_status_t ORCAHSALoaderContext::SamplerCreate(
+void* PALHSALoaderContext::SegmentHostAddress(
+ amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset)
+{
+ void* host = nullptr;
+ assert(seg);
+ switch (segment) {
+ case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+ host = program_->codeSegCpu() + offset;
+ break;
+ case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+ case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+ case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+ default:
+ break;
+ }
+ return host;
+}
+
+bool PALHSALoaderContext::SegmentFreeze(
+ amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size)
+{
+ assert(seg);
+ switch (segment) {
+ case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+ case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+ case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+ return true;
+ case AMDGPU_HSA_SEGMENT_CODE_AGENT: {
+ if (program_->isNull()) {
+ return true;
+ }
+
+ const pal::Memory& mem = program_->codeSegGpu();
+ constexpr bool WaitForCopy = true;
+ mem.writeRawData(*mem.dev().xferQueue(), 0, size, program_->codeSegCpu(), WaitForCopy);
+ return true;
+ }
+ default:
+ assert(false);
+ return false;
+ }
+}
+
+hsa_status_t PALHSALoaderContext::SamplerCreate(
hsa_agent_t agent,
const hsa_ext_sampler_descriptor_t *sampler_descriptor,
hsa_ext_sampler_t *sampler_handle)
@@ -897,8 +955,9 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
return HSA_STATUS_SUCCESS;
}
-hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
- hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
+hsa_status_t PALHSALoaderContext::SamplerDestroy(
+ hsa_agent_t agent, hsa_ext_sampler_t sampler_handle)
+{
if (!agent.handle) {
return HSA_STATUS_ERROR_INVALID_AGENT;
}
@@ -908,7 +967,8 @@ hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
return HSA_STATUS_SUCCESS;
}
-void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
+address PALHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero)
+{
assert(size);
assert(align);
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
@@ -917,26 +977,21 @@ void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
if (zero) {
memset(ptr, 0, size);
}
- return ptr;
+ return reinterpret_cast(ptr);
}
-bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {
- if (!dst || !src || dst == src) {
- return false;
- }
- if (0 == size) {
- return true;
- }
- amd::Os::fastMemcpy((char*)dst + offset, src, size);
- return true;
+bool PALHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size)
+{
+ amd::Os::fastMemcpy((char*)dst + offset, src, size);
+ return true;
}
-void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
+void* PALHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
assert(size);
assert(align);
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
if (program_->isNull()) {
- return new char[size];
+ return CpuMemAlloc(size, align, zero);
}
pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
@@ -945,7 +1000,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
return nullptr;
}
assert(program_->dev().xferQueue());
- if (zero) {
+ if (zero && !program_->isInternal()) {
char pattern = 0;
program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size));
}
@@ -954,7 +1009,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
return mem;
}
-bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
+bool PALHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
if (!dst || !src || dst == src) {
return false;
}
@@ -962,7 +1017,7 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
return true;
}
if (program_->isNull()) {
- memcpy(reinterpret_cast(dst) + offset, src, size);
+ CpuMemCopy(dst, offset, src, size);
return true;
}
assert(program_->dev().xferQueue());
@@ -972,16 +1027,62 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
return true;
}
-void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size)
+void PALHSALoaderContext::GpuMemFree(void *ptr, size_t size)
{
if (program_->isNull()) {
- delete[] reinterpret_cast(ptr);
+ CpuMemFree(ptr, size);
}
else {
delete reinterpret_cast(ptr);
}
}
+void* PALHSALoaderContext::KernelCodeAlloc(
+ size_t size, size_t align, bool zero)
+{
+ address host = CpuMemAlloc(size, align, zero);
+ pal::Memory* mem = nullptr;
+
+ if (!program_->isNull()) {
+ mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
+ if (!mem || !mem->create(pal::Resource::Local)) {
+ delete mem;
+ mem = nullptr;
+ }
+ }
+ program_->setCodeObjects(mem, host);
+ return ((host == nullptr || mem == nullptr) ? nullptr : mem);
+}
+
+bool PALHSALoaderContext::KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size)
+{
+ if (!dst || !src || dst == src) {
+ return false;
+ }
+ if (0 == size) {
+ return true;
+ }
+ if (program_->isNull()) {
+ return CpuMemCopy(dst, offset, src, size);
+ }
+ assert(program_->dev().xferQueue());
+ pal::Memory* mem = reinterpret_cast(dst);
+ if (mem == &program_->codeSegGpu()) {
+ return CpuMemCopy(program_->codeSegCpu(), offset, src, size);
+ }
+ assert(!"The segement doesn't match code segment in the program!");
+ return false;
+}
+
+void PALHSALoaderContext::KernelCodeFree(void *ptr, size_t size)
+{
+ CpuMemFree(program_->codeSegCpu(), size);
+ if (!program_->isNull()) {
+ delete reinterpret_cast(ptr);
+ }
+ program_->setCodeObjects(nullptr, nullptr);
+}
+
#if defined(WITH_LIGHTNING_COMPILER)
static hsa_status_t
@@ -1019,6 +1120,8 @@ bool
LightningProgram::linkImpl(amd::option::Options *options)
{
using namespace amd::opencl_driver;
+ internal_ = (compileOptions_.find("-cl-internal-kernel") !=
+ std::string::npos) ? true : false;
aclType continueCompileFrom = llvmBinary_.empty()
? getNextCompilationStageFromBinary(options)
@@ -1270,12 +1373,11 @@ LightningProgram::setKernels(
return false;
}
- /* FIXME_lmoriche: We need to call this!
status = executable_->Freeze(nullptr);
if (status != HSA_STATUS_SUCCESS) {
buildLog_ += "Error: Freezing the executable failed: ";
return false;
- }*/
+ }
size_t progvarsTotalSize = 0;
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
index f1b58cec56..83531961f4 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
@@ -34,11 +34,11 @@ namespace pal {
using namespace amd::hsa::loader;
class HSAILProgram;
-class ORCAHSALoaderContext final: public Context {
+class PALHSALoaderContext final: public Context {
public:
- ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}
+ PALHSALoaderContext(HSAILProgram* program): program_(program) {}
- virtual ~ORCAHSALoaderContext() {}
+ virtual ~PALHSALoaderContext() {}
hsa_isa_t IsaFromName(const char *name) override;
@@ -58,12 +58,10 @@ public:
hsa_agent_t agent, void* seg, size_t offset) override;
void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment,
- hsa_agent_t agent, void* seg, size_t offset) override {
- return nullptr;
- }
+ hsa_agent_t agent, void* seg, size_t offset) override;
bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
- hsa_agent_t agent, void* seg, size_t size) override { return false; }
+ hsa_agent_t agent, void* seg, size_t size) override;
bool ImageExtensionSupported() override { return false; }
@@ -109,20 +107,13 @@ private:
GpuMemFree(ptr, size);
}
- void* KernelCodeAlloc(
- hsa_agent_t agent, size_t size, size_t align, bool zero) {
- return CpuMemAlloc(size, align, zero);
- }
+ void* KernelCodeAlloc(size_t size, size_t align, bool zero);
- bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
- return CpuMemCopy(dst, offset, src, size);
- }
+ bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size);
- void KernelCodeFree(void *ptr, size_t size) {
- CpuMemFree(ptr, size);
- }
+ void KernelCodeFree(void *ptr, size_t size);
- void* CpuMemAlloc(size_t size, size_t align, bool zero);
+ address CpuMemAlloc(size_t size, size_t align, bool zero);
bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
@@ -136,9 +127,9 @@ private:
void GpuMemFree(void *ptr, size_t size = 0);
- ORCAHSALoaderContext(const ORCAHSALoaderContext &c);
+ PALHSALoaderContext(const PALHSALoaderContext &c);
- ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);
+ PALHSALoaderContext& operator=(const PALHSALoaderContext &c);
pal::HSAILProgram* program_;
};
@@ -160,6 +151,9 @@ public:
void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }
+ void setCodeObjects(Memory* codeGpu, address codeCpu)
+ { codeSegGpu_ = codeGpu; codeSegCpu_ = codeCpu; }
+
const std::vector& globalStores() const { return globalStores_; }
//! Return a typecasted GPU device
@@ -182,9 +176,24 @@ public:
//! Returns TRUE if the program just compiled
bool isNull() const { return isNull_; }
+ //! Returns TRUE if the program is used internally by the runtime
+ bool isInternal() const { return internal_; }
+
//! Returns TRUE if the program contains static samplers
bool isStaticSampler() const { return (staticSamplers_.size() != 0); }
+ //! Returns code segment on GPU
+ const Memory& codeSegGpu() const { return *codeSegGpu_; }
+
+ //! Returns code segment on CPU
+ address codeSegCpu() const { return codeSegCpu_; }
+
+ //! Returns CPU address for a kernel
+ uint64_t findHostKernelAddress(uint64_t devAddr) const
+ {
+ return loader_->FindHostAddress(devAddr);
+ }
+
protected:
//! pre-compile setup for GPU
virtual bool initBuild(amd::option::Options* options);
@@ -270,17 +279,20 @@ protected:
aclBinaryOptions binOpts_; //!< Binary options to create aclBinary
std::vector globalStores_; //!< Global memory for the program
Memory* kernels_; //!< Table with kernel object pointers
+ Memory* codeSegGpu_; //!< GPU memory with code objects
+ address codeSegCpu_; //!< CPU memory with code objects
uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
std::list staticSamplers_; //!< List od internal static samplers
union {
struct {
uint32_t isNull_ : 1; //!< Null program no memory allocations
+ uint32_t internal_ : 1; //!< Internal blit program
};
uint32_t flags_; //!< Program flags
};
amd::hsa::loader::Loader* loader_; //!< Loader object
amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader
- ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader
+ PALHSALoaderContext loaderContext_; //!< Context for HSA Loader
};
#if defined(WITH_LIGHTNING_COMPILER)
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 615970b0e7..d40cac77d4 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -3300,7 +3300,7 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress());
// Get the address of the kernel code and its size for CPU access
- pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
+/* pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
if (nullptr != aqlCode) {
address aqlCodeAddr = static_cast(aqlCode->map(nullptr, 0));
dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
@@ -3309,7 +3309,7 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
else {
dbgManager->setKernelCodeInfo(nullptr, 0);
}
-
+*/
kernelInfo.trapPresent = false;
kernelInfo.trapHandler = nullptr;
kernelInfo.trapHandlerBuffer = nullptr;