diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp index 38862dda44..33438dfc8f 100644 --- a/rocclr/runtime/device/devkernel.cpp +++ b/rocclr/runtime/device/devkernel.cpp @@ -799,6 +799,11 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) { // Allocate the hidden arguments, but abstraction layer will skip them if (isHidden) { + + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) { + setDynamicParallelFlag(true); + } + offset = amd::alignUp(offset, alignment); desc.offset_ = offset; desc.size_ = size; diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 95ac784907..a16eef782b 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -2179,15 +2179,24 @@ bool Device::createBlitProgram() { // Delayed compilation due to brig_loader memory allocation const char* scheduler = nullptr; const char* ocl20 = nullptr; -#if !defined(WITH_LIGHTNING_COMPILER) + std::string sch = SchedulerSourceCode; if (settings().oclVersion_ >= OpenCL20) { size_t loc = sch.find("%s"); sch.replace(loc, 2, iDev()->GetDispatchKernelSource()); +#if defined(WITH_LIGHTNING_COMPILER) + // For LC, replace "amd_scheduler" with "amd_scheduler_pal" + static const char AmdScheduler[] = "amd_scheduler"; + static const char AmdSchedulerPal[] = "amd_scheduler_pal"; + loc = sch.find(AmdScheduler); + sch.replace(loc, strlen(AmdScheduler), AmdSchedulerPal); + loc = sch.find(AmdScheduler, (loc + strlen(AmdSchedulerPal))); + sch.replace(loc, strlen(AmdScheduler), AmdSchedulerPal); +#endif scheduler = sch.c_str(); ocl20 = "-cl-std=CL2.0"; } -#endif // !defined(WITH_LIGHTNING_COMPILER) + blitProgram_ = new BlitProgram(context_); // Create blit programs if (blitProgram_ == nullptr || !blitProgram_->create(this, scheduler, ocl20)) { diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index 2b2df120ed..504285f256 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -421,6 +421,26 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str(); } + if (!kernelMD->mAttrs.mRuntimeHandle.empty()) { + hsa_agent_t agent; + agent.handle = 1; + amd::hsa::loader::Symbol* rth_symbol; + + // Get the runtime handle symbol GPU address + rth_symbol = prog_.GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()), + const_cast(&agent)); + uint64_t symbol_address; + rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); + + // Copy the kernel_object pointer to the runtime handle symbol GPU address + const Memory& codeSegGpu = prog_.codeSegGpu(); + uint64_t offset = symbol_address - codeSegGpu.vmAddress(); + uint64_t kernel_object = gpuAqlCode(); + VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); + + codeSegGpu.writeRawData(*gpu, offset, 8, &kernel_object, true); + } + // Copy wavefront size workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp index f1b920c257..2d5c57a3de 100644 --- a/rocclr/runtime/device/pal/palprogram.cpp +++ b/rocclr/runtime/device/pal/palprogram.cpp @@ -1592,11 +1592,6 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); } - // Allocate kernel table for device enqueuing - if (!isNull() && false /*dynamicParallelism*/ && !allocKernelTable()) { - return false; - } - // Get the list of global variables std::vector glbVarNames; status = executable_->IterateSymbols(GetGlobalVarNamesCallback, &glbVarNames); diff --git a/rocclr/runtime/device/pal/palprogram.hpp b/rocclr/runtime/device/pal/palprogram.hpp index e7046e285c..e89b8f5ca4 100644 --- a/rocclr/runtime/device/pal/palprogram.hpp +++ b/rocclr/runtime/device/pal/palprogram.hpp @@ -181,6 +181,11 @@ class HSAILProgram : public device::Program { //! Global variables are a part of the code segment bool GlobalVariables() const { return globalVars_; } + //! Get symbol by name + amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const { + return executable_->GetSymbol(symbol_name, agent); + } + protected: //! pre-compile setup for GPU virtual bool initBuild(amd::option::Options* options); diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index 2a40468b6c..438b43fede 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -1894,18 +1894,10 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ print << wraps[i].aql.grid_size_y << ", "; print << wraps[i].aql.grid_size_z << "]\n"; - uint64_t* kernels = - (uint64_t*)(const_cast(hsaKernel.prog().kernelTable())->map(this)); - for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { - if (kernels[j] == wraps[i].aql.kernel_object) { - break; - } - } - const_cast(hsaKernel.prog().kernelTable())->unmap(this); HSAILKernel* child = nullptr; for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end(); ++it) { - if (j == static_cast(it->second)->index()) { + if (wraps[i].aql.kernel_object == static_cast(it->second)->gpuAqlCode()) { child = static_cast(it->second); } } @@ -1996,14 +1988,15 @@ bool VirtualGPU::PreDeviceEnqueue( } *vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress(); - (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress()); + (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()); + // Acquire USWC memory for the scheduler parameters (*gpuDefQueue)->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam)); // Add memory handles before the actual dispatch addVmMemory((*gpuDefQueue)->virtualQueue_); addVmMemory((*gpuDefQueue)->schedParams_); - addVmMemory(hsaKernel.prog().kernelTable()); + return true; } @@ -3252,10 +3245,15 @@ amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) { return mem; } -void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) { - const static bool Wait = true; - vqHeader_->kernel_table = kernelTable; - virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait); +void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable) { + if (nullptr == kernelTable) { + vqHeader_->kernel_table = 0; + } else { + vqHeader_->kernel_table = kernelTable->vmAddress(); + addVmMemory(kernelTable); + } + + virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, true); } void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt, diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp index 221b98b36e..85fc889e6f 100644 --- a/rocclr/runtime/device/pal/palvirtual.hpp +++ b/rocclr/runtime/device/pal/palvirtual.hpp @@ -423,7 +423,7 @@ class VirtualGPU : public device::VirtualDevice { Memory* vQueue() const { return virtualQueue_; } //! Update virtual queue header - void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable); + void writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable); //! Returns TRUE if virtual queue was successfully allocatted bool createVirtualQueue(uint deviceQueueSize //!< Device queue size