From bc18cf7a72cecdfe9783dd070f074bfc701acf6b Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 24 Oct 2016 15:25:54 -0400 Subject: [PATCH] P4 to Git Change 1330796 by gandryey@gera-w8 on 2016/10/24 15:12:41 SWDEV-86035 - Add PAL backend to OpenCL - Use loader for the code objects loading and avoid allocation of each individual kernel Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#17 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#33 edit [ROCm/clr commit: 4e1f32bd714cdd631a122668b1702f142fffe476] --- .../rocclr/runtime/device/pal/palkernel.cpp | 33 +--- .../rocclr/runtime/device/pal/palkernel.hpp | 8 +- .../rocclr/runtime/device/pal/palprogram.cpp | 180 ++++++++++++++---- .../rocclr/runtime/device/pal/palprogram.hpp | 54 ++++-- .../rocclr/runtime/device/pal/palvirtual.cpp | 4 +- 5 files changed, 185 insertions(+), 94 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index b639715f00..e45a4ef514 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -377,11 +377,11 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) if (!sym) { return false; } - uint64_t akc_addr = 0; - if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&akc_addr))) { + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) { return false; } - amd_kernel_code_t *akc = reinterpret_cast(akc_addr); + + amd_kernel_code_t *akc = reinterpret_cast(prog().findHostKernelAddress(code_)); cpuAqlCode_ = akc; if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast(&codeSize_))) { return false; @@ -390,22 +390,6 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) { return false; } - // Allocate HW resources for the real program only - if (!prog().isNull()) { - code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align)); - Resource::MemoryType type = Resource::Local; - - // Initialize kernel ISA code - if (code_ && code_->create(type)) { - constexpr bool WaitForUpload = true; - code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_, - reinterpret_cast(akc), WaitForUpload); - } - else { - LogError("Failed to allocate ISA code!"); - return false; - } - } assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned"); @@ -591,9 +575,8 @@ HSAILKernel::HSAILKernel(std::string name, , dev_(prog->dev()) , prog_(*prog) , index_(0) - , code_(nullptr) + , code_(0) , codeSize_(0) - , hwMetaData_(nullptr) , extraArgumentsNum_(extraArgsNum) , waveLimiter_(this, (prog->isNull() ? 1 : dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * dev().hwInfo()->simdPerCU_) @@ -608,10 +591,6 @@ HSAILKernel::~HSAILKernel() delete arg; arguments_.pop_back(); } - - delete [] hwMetaData_; - - delete code_; } bool @@ -1217,7 +1196,7 @@ HSAILKernel::loadArguments( // Initialize kernel ISA and execution buffer requirements hsaDisp->private_segment_size = spillSegSize(); hsaDisp->group_segment_size = ldsAddress - ldsSize(); - hsaDisp->kernel_object = gpuAqlCode()->vmAddress(); + hsaDisp->kernel_object = gpuAqlCode(); ConstBuffer* cb = gpu.constBufs_[0]; cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); @@ -1228,7 +1207,7 @@ HSAILKernel::loadArguments( hsaDisp->completion_signal.handle = 0; memList.push_back(cb); - memList.push_back(gpuAqlCode()); + memList.push_back(&prog().codeSegGpu()); for (pal::Memory * mem : prog().globalStores()) { memList.push_back(mem); } diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp index c4fc7c0c74..581ea3adae 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp @@ -154,7 +154,7 @@ public: const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; } //! Returns memory object with AQL code - pal::Memory* gpuAqlCode() const { return code_; } + uint64_t gpuAqlCode() const { return code_; } //! Returns size of AQL code size_t aqlCodeSize() const { return codeSize_; } @@ -249,12 +249,10 @@ protected: std::vector printf_; //!< Format strings for GPU printf support uint index_; //!< Kernel index in the program - pal::Memory* code_; //!< Memory object with ISA code + uint64_t code_; //!< GPU memory pointer to the kernel size_t codeSize_; //!< Size of ISA code - char* hwMetaData_; //!< SI metadata - - uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments + uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments union Flags { struct { diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp index 8fe8796525..dd564702bc 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp @@ -34,6 +34,8 @@ HSAILProgram::HSAILProgram(Device& device) , binaryElf_(nullptr) , rawBinary_(nullptr) , kernels_(nullptr) + , codeSegGpu_(nullptr) + , codeSegCpu_(nullptr) , maxScratchRegs_(0) , flags_(0) , executable_(nullptr) @@ -54,6 +56,8 @@ HSAILProgram::HSAILProgram(NullDevice& device) , binaryElf_(nullptr) , rawBinary_(nullptr) , kernels_(nullptr) + , codeSegGpu_(nullptr) + , codeSegCpu_(nullptr) , maxScratchRegs_(0) , flags_(0) , executable_(nullptr) @@ -93,6 +97,8 @@ HSAILProgram::~HSAILProgram() } delete kernels_; amd::hsa::loader::Loader::Destroy(loader_); + assert((codeSegGpu_ == nullptr) && "Loader didn't destroy code!"); + assert((codeSegCpu_ == nullptr) && "Loader didn't destroy code!"); } bool @@ -470,6 +476,9 @@ HSAILProgram::linkImpl(amd::option::Options* options) aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; bool finalize = true; bool hsaLoad = true; + internal_ = (compileOptions_.find("-cl-internal-kernel") != + std::string::npos) ? true : false; + // If !binaryElf_ then program must have been created using clCreateProgramWithBinary if (!binaryElf_) { @@ -545,6 +554,11 @@ HSAILProgram::linkImpl(amd::option::Options* options) buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; return false; } + status = executable_->Freeze(nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object freeze failed.\n"; + return false; + } } size_t kernelNamesSize = 0; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); @@ -687,8 +701,7 @@ HSAILProgram::allocKernelTable() kernels_->map(nullptr, pal::Resource::WriteOnly)); for (auto& it : kernels()) { HSAILKernel* kernel = static_cast(it.second); - table[kernel->index()] = static_cast( - kernel->gpuAqlCode()->vmAddress()); + table[kernel->index()] = static_cast(kernel->gpuAqlCode()); } kernels_->unmap(nullptr); } @@ -699,10 +712,7 @@ void HSAILProgram::fillResListWithKernels( std::vector& memList) const { - for (auto& it : kernels()) { - memList.push_back( - static_cast(it.second)->gpuAqlCode()); - } + memList.push_back(&codeSegGpu()); } const aclTargetInfo & @@ -749,7 +759,7 @@ HSAILProgram::saveBinaryAndSetType(type_t type) return true; } -hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) { +hsa_isa_t PALHSALoaderContext::IsaFromName(const char *name) { hsa_isa_t isa = {0}; if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; } if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; } @@ -762,7 +772,7 @@ hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) { return isa; } -bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { +bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { switch (program_->dev().hwInfo()->gfxipVersion_) { default: LogError("Unsupported gfxip version"); @@ -785,7 +795,7 @@ bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) } } -void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, +void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) { assert(size); assert(align); @@ -795,13 +805,13 @@ void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, case AMDGPU_HSA_SEGMENT_READONLY_AGENT: return AgentGlobalAlloc(agent, size, align, zero); case AMDGPU_HSA_SEGMENT_CODE_AGENT: - return KernelCodeAlloc(agent, size, align, zero); + return KernelCodeAlloc(size, align, zero); default: assert(false); return 0; } } -bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, +bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) { switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: @@ -815,8 +825,9 @@ bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, } } -void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t size) { +void PALHSALoaderContext::SegmentFree( + amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) +{ switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: @@ -827,25 +838,72 @@ void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, } } -void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t offset) { +void* PALHSALoaderContext::SegmentAddress( + amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) +{ assert(seg); switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: case AMDGPU_HSA_SEGMENT_READONLY_AGENT: { + case AMDGPU_HSA_SEGMENT_CODE_AGENT: if (!program_->isNull()) { pal::Memory *gpuMem = reinterpret_cast(seg); return reinterpret_cast(gpuMem->vmAddress() + offset); } + else { + return reinterpret_cast
(seg) + offset; + } } - case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset; default: assert(false); return nullptr; } } -hsa_status_t ORCAHSALoaderContext::SamplerCreate( +void* PALHSALoaderContext::SegmentHostAddress( + amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) +{ + void* host = nullptr; + assert(seg); + switch (segment) { + case AMDGPU_HSA_SEGMENT_CODE_AGENT: + host = program_->codeSegCpu() + offset; + break; + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: + default: + break; + } + return host; +} + +bool PALHSALoaderContext::SegmentFreeze( + amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) +{ + assert(seg); + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: + return true; + case AMDGPU_HSA_SEGMENT_CODE_AGENT: { + if (program_->isNull()) { + return true; + } + + const pal::Memory& mem = program_->codeSegGpu(); + constexpr bool WaitForCopy = true; + mem.writeRawData(*mem.dev().xferQueue(), 0, size, program_->codeSegCpu(), WaitForCopy); + return true; + } + default: + assert(false); + return false; + } +} + +hsa_status_t PALHSALoaderContext::SamplerCreate( hsa_agent_t agent, const hsa_ext_sampler_descriptor_t *sampler_descriptor, hsa_ext_sampler_t *sampler_handle) @@ -897,8 +955,9 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate( return HSA_STATUS_SUCCESS; } -hsa_status_t ORCAHSALoaderContext::SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) { +hsa_status_t PALHSALoaderContext::SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) +{ if (!agent.handle) { return HSA_STATUS_ERROR_INVALID_AGENT; } @@ -908,7 +967,8 @@ hsa_status_t ORCAHSALoaderContext::SamplerDestroy( return HSA_STATUS_SUCCESS; } -void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) { +address PALHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) +{ assert(size); assert(align); assert(sizeof(void*) == 8 || sizeof(void*) == 4); @@ -917,26 +977,21 @@ void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) { if (zero) { memset(ptr, 0, size); } - return ptr; + return reinterpret_cast
(ptr); } -bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) { - if (!dst || !src || dst == src) { - return false; - } - if (0 == size) { - return true; - } - amd::Os::fastMemcpy((char*)dst + offset, src, size); - return true; +bool PALHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) +{ + amd::Os::fastMemcpy((char*)dst + offset, src, size); + return true; } -void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { +void* PALHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { assert(size); assert(align); assert(sizeof(void*) == 8 || sizeof(void*) == 4); if (program_->isNull()) { - return new char[size]; + return CpuMemAlloc(size, align, zero); } pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align)); @@ -945,7 +1000,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { return nullptr; } assert(program_->dev().xferQueue()); - if (zero) { + if (zero && !program_->isInternal()) { char pattern = 0; program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size)); } @@ -954,7 +1009,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { return mem; } -bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) { +bool PALHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) { if (!dst || !src || dst == src) { return false; } @@ -962,7 +1017,7 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, return true; } if (program_->isNull()) { - memcpy(reinterpret_cast
(dst) + offset, src, size); + CpuMemCopy(dst, offset, src, size); return true; } assert(program_->dev().xferQueue()); @@ -972,16 +1027,62 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, return true; } -void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size) +void PALHSALoaderContext::GpuMemFree(void *ptr, size_t size) { if (program_->isNull()) { - delete[] reinterpret_cast(ptr); + CpuMemFree(ptr, size); } else { delete reinterpret_cast(ptr); } } +void* PALHSALoaderContext::KernelCodeAlloc( + size_t size, size_t align, bool zero) +{ + address host = CpuMemAlloc(size, align, zero); + pal::Memory* mem = nullptr; + + if (!program_->isNull()) { + mem = new pal::Memory(program_->dev(), amd::alignUp(size, align)); + if (!mem || !mem->create(pal::Resource::Local)) { + delete mem; + mem = nullptr; + } + } + program_->setCodeObjects(mem, host); + return ((host == nullptr || mem == nullptr) ? nullptr : mem); +} + +bool PALHSALoaderContext::KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) +{ + if (!dst || !src || dst == src) { + return false; + } + if (0 == size) { + return true; + } + if (program_->isNull()) { + return CpuMemCopy(dst, offset, src, size); + } + assert(program_->dev().xferQueue()); + pal::Memory* mem = reinterpret_cast(dst); + if (mem == &program_->codeSegGpu()) { + return CpuMemCopy(program_->codeSegCpu(), offset, src, size); + } + assert(!"The segement doesn't match code segment in the program!"); + return false; +} + +void PALHSALoaderContext::KernelCodeFree(void *ptr, size_t size) +{ + CpuMemFree(program_->codeSegCpu(), size); + if (!program_->isNull()) { + delete reinterpret_cast(ptr); + } + program_->setCodeObjects(nullptr, nullptr); +} + #if defined(WITH_LIGHTNING_COMPILER) static hsa_status_t @@ -1019,6 +1120,8 @@ bool LightningProgram::linkImpl(amd::option::Options *options) { using namespace amd::opencl_driver; + internal_ = (compileOptions_.find("-cl-internal-kernel") != + std::string::npos) ? true : false; aclType continueCompileFrom = llvmBinary_.empty() ? getNextCompilationStageFromBinary(options) @@ -1270,12 +1373,11 @@ LightningProgram::setKernels( return false; } - /* FIXME_lmoriche: We need to call this! status = executable_->Freeze(nullptr); if (status != HSA_STATUS_SUCCESS) { buildLog_ += "Error: Freezing the executable failed: "; return false; - }*/ + } size_t progvarsTotalSize = 0; diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp index f1b58cec56..83531961f4 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp @@ -34,11 +34,11 @@ namespace pal { using namespace amd::hsa::loader; class HSAILProgram; -class ORCAHSALoaderContext final: public Context { +class PALHSALoaderContext final: public Context { public: - ORCAHSALoaderContext(HSAILProgram* program): program_(program) {} + PALHSALoaderContext(HSAILProgram* program): program_(program) {} - virtual ~ORCAHSALoaderContext() {} + virtual ~PALHSALoaderContext() {} hsa_isa_t IsaFromName(const char *name) override; @@ -58,12 +58,10 @@ public: hsa_agent_t agent, void* seg, size_t offset) override; void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t offset) override { - return nullptr; - } + hsa_agent_t agent, void* seg, size_t offset) override; bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t size) override { return false; } + hsa_agent_t agent, void* seg, size_t size) override; bool ImageExtensionSupported() override { return false; } @@ -109,20 +107,13 @@ private: GpuMemFree(ptr, size); } - void* KernelCodeAlloc( - hsa_agent_t agent, size_t size, size_t align, bool zero) { - return CpuMemAlloc(size, align, zero); - } + void* KernelCodeAlloc(size_t size, size_t align, bool zero); - bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) { - return CpuMemCopy(dst, offset, src, size); - } + bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size); - void KernelCodeFree(void *ptr, size_t size) { - CpuMemFree(ptr, size); - } + void KernelCodeFree(void *ptr, size_t size); - void* CpuMemAlloc(size_t size, size_t align, bool zero); + address CpuMemAlloc(size_t size, size_t align, bool zero); bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size); @@ -136,9 +127,9 @@ private: void GpuMemFree(void *ptr, size_t size = 0); - ORCAHSALoaderContext(const ORCAHSALoaderContext &c); + PALHSALoaderContext(const PALHSALoaderContext &c); - ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c); + PALHSALoaderContext& operator=(const PALHSALoaderContext &c); pal::HSAILProgram* program_; }; @@ -160,6 +151,9 @@ public: void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } + void setCodeObjects(Memory* codeGpu, address codeCpu) + { codeSegGpu_ = codeGpu; codeSegCpu_ = codeCpu; } + const std::vector& globalStores() const { return globalStores_; } //! Return a typecasted GPU device @@ -182,9 +176,24 @@ public: //! Returns TRUE if the program just compiled bool isNull() const { return isNull_; } + //! Returns TRUE if the program used internally by runtime + bool isInternal() const { return internal_; } + //! Returns TRUE if the program contains static samplers bool isStaticSampler() const { return (staticSamplers_.size() != 0); } + //! Returns code segement on GPU + const Memory& codeSegGpu() const { return *codeSegGpu_; } + + //! Returns code segement on CPU + address codeSegCpu() const { return codeSegCpu_; } + + //! Returns CPU address for a kernel + uint64_t findHostKernelAddress(uint64_t devAddr) const + { + return loader_->FindHostAddress(devAddr); + } + protected: //! pre-compile setup for GPU virtual bool initBuild(amd::option::Options* options); @@ -270,17 +279,20 @@ protected: aclBinaryOptions binOpts_; //!< Binary options to create aclBinary std::vector globalStores_; //!< Global memory for the program Memory* kernels_; //!< Table with kernel object pointers + Memory* codeSegGpu_; //!< GPU memory with code objects + address codeSegCpu_; //!< CPU memory with code objects uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel std::list staticSamplers_; //!< List od internal static samplers union { struct { uint32_t isNull_ : 1; //!< Null program no memory allocations + uint32_t internal_ : 1; //!< Internal blit program }; uint32_t flags_; //!< Program flags }; amd::hsa::loader::Loader* loader_; //!< Loader object amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader - ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader + PALHSALoaderContext loaderContext_; //!< Context for HSA Loader }; #if defined(WITH_LIGHTNING_COMPILER) diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 615970b0e7..d40cac77d4 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -3300,7 +3300,7 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); // Get the address of the kernel code and its size for CPU access - pal::Memory* aqlCode = hsaKernel.gpuAqlCode(); +/* pal::Memory* aqlCode = hsaKernel.gpuAqlCode(); if (nullptr != aqlCode) { address aqlCodeAddr = static_cast
(aqlCode->map(nullptr, 0)); dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); @@ -3309,7 +3309,7 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, else { dbgManager->setKernelCodeInfo(nullptr, 0); } - +*/ kernelInfo.trapPresent = false; kernelInfo.trapHandler = nullptr; kernelInfo.trapHandlerBuffer = nullptr;