From d9bdadb67366c3687facaf77fdb303c4f484bc08 Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 21 Feb 2019 17:42:08 -0500
Subject: [PATCH] P4 to Git Change 1746912 by cpaquot@cpaquot-ocl-lc-lnx on
2019/02/21 14:16:35
SWDEV-178453 - [HIP] Add extra parameter for sharedMemBytes
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#129 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#58 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#72 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#91 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#91 edit
[ROCm/clr commit: a343072c71bd367124c4049d1d32ec180c12651d]
---
projects/clr/rocclr/runtime/device/pal/palvirtual.cpp | 6 +++---
projects/clr/rocclr/runtime/device/pal/palvirtual.hpp | 3 ++-
projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp | 6 +++---
projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp | 3 ++-
projects/clr/rocclr/runtime/platform/command.cpp | 4 ++--
projects/clr/rocclr/runtime/platform/command.hpp | 6 +++++-
6 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index b0a23a45fe..359331cc85 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -2100,7 +2100,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
profilingBegin(vcmd);
// Submit kernel to HW
- if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) {
+ if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2110,7 +2110,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, bool nativeMem,
- amd::Event* enqueueEvent)
+ amd::Event* enqueueEvent, uint32_t sharedMemBytes)
{
size_t newOffset[3] = { 0, 0, 0 };
size_t newGlobalSize[3] = { 0, 0, 0 };
@@ -2232,7 +2232,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
uint64_t vmParentWrap = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
- *this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
+ *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index a7a6b38ee7..aff4163677 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -299,7 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
- amd::Event* enqueueEvent = nullptr //!< Event provided in the enqueue kernel command
+ amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
+ uint32_t sharedMemBytes = 0 //!< Shared memory size
);
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 38cbde14c9..1c6567adfa 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1938,7 +1938,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
}
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
- const_address parameters, void* eventHandle) {
+ const_address parameters, void* eventHandle, uint32_t sharedMemBytes) {
device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast(*devKernel);
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -2113,7 +2113,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1;
dispatchPacket.kernarg_address = argBuffer;
- dispatchPacket.group_segment_size = ldsUsage;
+ dispatchPacket.group_segment_size = ldsUsage + sharedMemBytes;
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
// Dispatch the packet
@@ -2154,7 +2154,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
- static_cast(as_cl(&vcmd.event())))) {
+ static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
LogError("AQL dispatch failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index 0e9e18981b..7b7d9290b2 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -166,7 +166,8 @@ class VirtualGPU : public device::VirtualDevice {
bool submitKernelInternal(const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
- void* event_handle //!< Handle to OCL event for debugging
+ void* event_handle, //!< Handle to OCL event for debugging
+ uint32_t sharedMemBytes = 0 //!< Shared memory size
);
void submitNativeFn(amd::NativeFnCommand& cmd);
void submitMarker(amd::Marker& cmd);
diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp
index 99873ac981..19556b9939 100644
--- a/projects/clr/rocclr/runtime/platform/command.cpp
+++ b/projects/clr/rocclr/runtime/platform/command.cpp
@@ -226,8 +226,8 @@ void Command::enqueue() {
const Context& Command::context() const { return queue_->context(); }
NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
- Kernel& kernel, const NDRangeContainer& sizes)
- : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes) {
+ Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes)
+ : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes), sharedMemBytes_(sharedMemBytes) {
auto& device = queue.device();
auto devKernel = const_cast(kernel.getDeviceKernel(device));
profilingInfo_.setCallback(devKernel->getProfilingCallback(
diff --git a/projects/clr/rocclr/runtime/platform/command.hpp b/projects/clr/rocclr/runtime/platform/command.hpp
index e979112ada..bad260289e 100644
--- a/projects/clr/rocclr/runtime/platform/command.hpp
+++ b/projects/clr/rocclr/runtime/platform/command.hpp
@@ -752,11 +752,12 @@ class NDRangeKernelCommand : public Command {
Kernel& kernel_;
NDRangeContainer sizes_;
address parameters_;
+ uint32_t sharedMemBytes_;
public:
//! Construct an ExecuteKernel command
NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
- const NDRangeContainer& sizes);
+ const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0);
virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
@@ -772,6 +773,9 @@ class NDRangeKernelCommand : public Command {
//! Return the kernel NDRange.
const NDRangeContainer& sizes() const { return sizes_; }
+ //! Return the shared memory size
+ uint32_t sharedMemBytes() const { return sharedMemBytes_; }
+
//! Set the local work size.
void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }