P4 to Git Change 1746912 by cpaquot@cpaquot-ocl-lc-lnx on 2019/02/21 14:16:35

SWDEV-178453 - [HIP] Add extra parameter for sharedMemBytes

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#129 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#58 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#72 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#91 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#91 edit


[ROCm/clr commit: a343072c71]
This commit is contained in:
foreman
2019-02-21 17:42:08 -05:00
parent 10520b6487
commit d9bdadb673
6 changed files with 17 additions and 11 deletions
@@ -2100,7 +2100,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
profilingBegin(vcmd);
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) {
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2110,7 +2110,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, bool nativeMem,
amd::Event* enqueueEvent)
amd::Event* enqueueEvent, uint32_t sharedMemBytes)
{
size_t newOffset[3] = { 0, 0, 0 };
size_t newGlobalSize[3] = { 0, 0, 0 };
@@ -2232,7 +2232,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
uint64_t vmParentWrap = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
*this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
@@ -299,7 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = nullptr //!< Event provided in the enqueue kernel command
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
uint32_t sharedMemBytes = 0 //!< Shared memory size
);
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
@@ -1938,7 +1938,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
}
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, void* eventHandle) {
const_address parameters, void* eventHandle, uint32_t sharedMemBytes) {
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -2113,7 +2113,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1;
dispatchPacket.kernarg_address = argBuffer;
dispatchPacket.group_segment_size = ldsUsage;
dispatchPacket.group_segment_size = ldsUsage + sharedMemBytes;
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
// Dispatch the packet
@@ -2154,7 +2154,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
static_cast<void*>(as_cl(&vcmd.event())))) {
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
LogError("AQL dispatch failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -166,7 +166,8 @@ class VirtualGPU : public device::VirtualDevice {
bool submitKernelInternal(const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
void* event_handle //!< Handle to OCL event for debugging
void* event_handle, //!< Handle to OCL event for debugging
uint32_t sharedMemBytes = 0 //!< Shared memory size
);
void submitNativeFn(amd::NativeFnCommand& cmd);
void submitMarker(amd::Marker& cmd);
@@ -226,8 +226,8 @@ void Command::enqueue() {
const Context& Command::context() const { return queue_->context(); }
NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
Kernel& kernel, const NDRangeContainer& sizes)
: Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes) {
Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes)
: Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes), sharedMemBytes_(sharedMemBytes) {
auto& device = queue.device();
auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
profilingInfo_.setCallback(devKernel->getProfilingCallback(
@@ -752,11 +752,12 @@ class NDRangeKernelCommand : public Command {
Kernel& kernel_;
NDRangeContainer sizes_;
address parameters_;
uint32_t sharedMemBytes_;
public:
//! Construct an ExecuteKernel command
NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
const NDRangeContainer& sizes);
const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0);
virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
@@ -772,6 +773,9 @@ class NDRangeKernelCommand : public Command {
//! Return the kernel NDRange.
const NDRangeContainer& sizes() const { return sizes_; }
//! Return the shared memory size
uint32_t sharedMemBytes() const { return sharedMemBytes_; }
//! Set the local work size.
void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }