From 41facdb089a76647dd9247e1e0bb1af83caf2839 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 18 Sep 2018 10:42:05 -0400
Subject: [PATCH] P4 to Git Change 1607329 by jatang@jatang_win_pal_lc on
2018/09/18 10:34:41
SWDEV-148809 - Device Enqueue on LC/PAL.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#109 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#66 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#71 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#28 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#124 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#55 edit
---
rocclr/runtime/device/devkernel.cpp | 5 +++++
rocclr/runtime/device/pal/paldevice.cpp | 13 +++++++++--
rocclr/runtime/device/pal/palkernel.cpp | 20 +++++++++++++++++
rocclr/runtime/device/pal/palprogram.cpp | 5 -----
rocclr/runtime/device/pal/palprogram.hpp | 5 +++++
rocclr/runtime/device/pal/palvirtual.cpp | 28 +++++++++++-------------
rocclr/runtime/device/pal/palvirtual.hpp | 2 +-
7 files changed, 55 insertions(+), 23 deletions(-)
diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp
index 38862dda44..33438dfc8f 100644
--- a/rocclr/runtime/device/devkernel.cpp
+++ b/rocclr/runtime/device/devkernel.cpp
@@ -799,6 +799,11 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
// Allocate the hidden arguments, but abstraction layer will skip them
if (isHidden) {
+
+ if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) {
+ setDynamicParallelFlag(true);
+ }
+
offset = amd::alignUp(offset, alignment);
desc.offset_ = offset;
desc.size_ = size;
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 95ac784907..a16eef782b 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -2179,15 +2179,24 @@ bool Device::createBlitProgram() {
// Delayed compilation due to brig_loader memory allocation
const char* scheduler = nullptr;
const char* ocl20 = nullptr;
-#if !defined(WITH_LIGHTNING_COMPILER)
+
std::string sch = SchedulerSourceCode;
if (settings().oclVersion_ >= OpenCL20) {
size_t loc = sch.find("%s");
sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
+#if defined(WITH_LIGHTNING_COMPILER)
+ // For LC, replace "amd_scheduler" with "amd_scheduler_pal"
+ static const char AmdScheduler[] = "amd_scheduler";
+ static const char AmdSchedulerPal[] = "amd_scheduler_pal";
+ loc = sch.find(AmdScheduler);
+ sch.replace(loc, strlen(AmdScheduler), AmdSchedulerPal);
+ loc = sch.find(AmdScheduler, (loc + strlen(AmdSchedulerPal)));
+ sch.replace(loc, strlen(AmdScheduler), AmdSchedulerPal);
+#endif
scheduler = sch.c_str();
ocl20 = "-cl-std=CL2.0";
}
-#endif // !defined(WITH_LIGHTNING_COMPILER)
+
blitProgram_ = new BlitProgram(context_);
// Create blit programs
if (blitProgram_ == nullptr || !blitProgram_->create(this, scheduler, ocl20)) {
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 2b2df120ed..504285f256 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -421,6 +421,26 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str();
}
+ if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
+ hsa_agent_t agent;
+ agent.handle = 1;
+ amd::hsa::loader::Symbol* rth_symbol;
+
+ // Get the runtime handle symbol GPU address
+ rth_symbol = prog_.GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()),
+ const_cast(&agent));
+ uint64_t symbol_address;
+ rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
+
+ // Copy the kernel_object pointer to the runtime handle symbol GPU address
+ const Memory& codeSegGpu = prog_.codeSegGpu();
+ uint64_t offset = symbol_address - codeSegGpu.vmAddress();
+ uint64_t kernel_object = gpuAqlCode();
+ VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
+
+ codeSegGpu.writeRawData(*gpu, offset, 8, &kernel_object, true);
+ }
+
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp
index f1b920c257..2d5c57a3de 100644
--- a/rocclr/runtime/device/pal/palprogram.cpp
+++ b/rocclr/runtime/device/pal/palprogram.cpp
@@ -1592,11 +1592,6 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
}
- // Allocate kernel table for device enqueuing
- if (!isNull() && false /*dynamicParallelism*/ && !allocKernelTable()) {
- return false;
- }
-
// Get the list of global variables
std::vector glbVarNames;
status = executable_->IterateSymbols(GetGlobalVarNamesCallback, &glbVarNames);
diff --git a/rocclr/runtime/device/pal/palprogram.hpp b/rocclr/runtime/device/pal/palprogram.hpp
index e7046e285c..e89b8f5ca4 100644
--- a/rocclr/runtime/device/pal/palprogram.hpp
+++ b/rocclr/runtime/device/pal/palprogram.hpp
@@ -181,6 +181,11 @@ class HSAILProgram : public device::Program {
//! Global variables are a part of the code segment
bool GlobalVariables() const { return globalVars_; }
+ //! Get symbol by name
+ amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const {
+ return executable_->GetSymbol(symbol_name, agent);
+ }
+
protected:
//! pre-compile setup for GPU
virtual bool initBuild(amd::option::Options* options);
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 2a40468b6c..438b43fede 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -1894,18 +1894,10 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
print << wraps[i].aql.grid_size_y << ", ";
print << wraps[i].aql.grid_size_z << "]\n";
- uint64_t* kernels =
- (uint64_t*)(const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
- for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
- if (kernels[j] == wraps[i].aql.kernel_object) {
- break;
- }
- }
- const_cast<Memory*>(hsaKernel.prog().kernelTable())->unmap(this);
HSAILKernel* child = nullptr;
for (auto it = hsaKernel.prog().kernels().begin();
it != hsaKernel.prog().kernels().end(); ++it) {
- if (j == static_cast<HSAILKernel*>(it->second)->index()) {
+ if (wraps[i].aql.kernel_object == static_cast<HSAILKernel*>(it->second)->gpuAqlCode()) {
child = static_cast<HSAILKernel*>(it->second);
}
}
@@ -1996,14 +1988,15 @@ bool VirtualGPU::PreDeviceEnqueue(
}
*vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress();
- (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress());
+ (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable());
+
// Acquire USWC memory for the scheduler parameters
(*gpuDefQueue)->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam));
// Add memory handles before the actual dispatch
addVmMemory((*gpuDefQueue)->virtualQueue_);
addVmMemory((*gpuDefQueue)->schedParams_);
- addVmMemory(hsaKernel.prog().kernelTable());
+
return true;
}
@@ -3252,10 +3245,15 @@ amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) {
return mem;
}
-void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) {
- const static bool Wait = true;
- vqHeader_->kernel_table = kernelTable;
- virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
+void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable) {
+ if (nullptr == kernelTable) {
+ vqHeader_->kernel_table = 0;
+ } else {
+ vqHeader_->kernel_table = kernelTable->vmAddress();
+ addVmMemory(kernelTable);
+ }
+
+ virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, true);
}
void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt,
diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp
index 221b98b36e..85fc889e6f 100644
--- a/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/rocclr/runtime/device/pal/palvirtual.hpp
@@ -423,7 +423,7 @@ class VirtualGPU : public device::VirtualDevice {
Memory* vQueue() const { return virtualQueue_; }
//! Update virtual queue header
- void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
+ void writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable);
//! Returns TRUE if virtual queue was successfully allocatted
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size