P4 to Git Change 1746912 by cpaquot@cpaquot-ocl-lc-lnx on 2019/02/21 14:16:35
SWDEV-178453 - [HIP] Add extra parameter for sharedMemBytes
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#129 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#58 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#72 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#91 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#91 edit
[ROCm/clr commit: a343072c71]
This commit is contained in:
@@ -2100,7 +2100,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
profilingBegin(vcmd);
|
||||
|
||||
// Submit kernel to HW
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) {
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
@@ -2110,7 +2110,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
|
||||
const_address parameters, bool nativeMem,
|
||||
amd::Event* enqueueEvent)
|
||||
amd::Event* enqueueEvent, uint32_t sharedMemBytes)
|
||||
{
|
||||
size_t newOffset[3] = { 0, 0, 0 };
|
||||
size_t newGlobalSize[3] = { 0, 0, 0 };
|
||||
@@ -2232,7 +2232,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
uint64_t vmParentWrap = 0;
|
||||
// Program the kernel arguments for the GPU execution
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
|
||||
*this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
|
||||
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
|
||||
if (nullptr == aqlPkt) {
|
||||
LogError("Couldn't load kernel arguments");
|
||||
return false;
|
||||
|
||||
@@ -299,7 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
const amd::Kernel& kernel, //!< Kernel for execution
|
||||
const_address parameters, //!< Parameters for the kernel
|
||||
bool nativeMem = true, //!< Native memory objects
|
||||
amd::Event* enqueueEvent = nullptr //!< Event provided in the enqueue kernel command
|
||||
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
|
||||
uint32_t sharedMemBytes = 0 //!< Shared memory size
|
||||
);
|
||||
void submitNativeFn(amd::NativeFnCommand& vcmd);
|
||||
void submitFillMemory(amd::FillMemoryCommand& vcmd);
|
||||
|
||||
@@ -1938,7 +1938,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
}
|
||||
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
|
||||
const_address parameters, void* eventHandle) {
|
||||
const_address parameters, void* eventHandle, uint32_t sharedMemBytes) {
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
|
||||
@@ -2113,7 +2113,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1;
|
||||
|
||||
dispatchPacket.kernarg_address = argBuffer;
|
||||
dispatchPacket.group_segment_size = ldsUsage;
|
||||
dispatchPacket.group_segment_size = ldsUsage + sharedMemBytes;
|
||||
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
|
||||
|
||||
// Dispatch the packet
|
||||
@@ -2154,7 +2154,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
|
||||
// Submit kernel to HW
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
|
||||
static_cast<void*>(as_cl(&vcmd.event())))) {
|
||||
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
|
||||
LogError("AQL dispatch failed!");
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
@@ -166,7 +166,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
bool submitKernelInternal(const amd::NDRangeContainer& sizes, //!< Workload sizes
|
||||
const amd::Kernel& kernel, //!< Kernel for execution
|
||||
const_address parameters, //!< Parameters for the kernel
|
||||
void* event_handle //!< Handle to OCL event for debugging
|
||||
void* event_handle, //!< Handle to OCL event for debugging
|
||||
uint32_t sharedMemBytes = 0 //!< Shared memory size
|
||||
);
|
||||
void submitNativeFn(amd::NativeFnCommand& cmd);
|
||||
void submitMarker(amd::Marker& cmd);
|
||||
|
||||
@@ -226,8 +226,8 @@ void Command::enqueue() {
|
||||
const Context& Command::context() const { return queue_->context(); }
|
||||
|
||||
NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
|
||||
Kernel& kernel, const NDRangeContainer& sizes)
|
||||
: Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes) {
|
||||
Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes)
|
||||
: Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes), sharedMemBytes_(sharedMemBytes) {
|
||||
auto& device = queue.device();
|
||||
auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
|
||||
profilingInfo_.setCallback(devKernel->getProfilingCallback(
|
||||
|
||||
@@ -752,11 +752,12 @@ class NDRangeKernelCommand : public Command {
|
||||
Kernel& kernel_;
|
||||
NDRangeContainer sizes_;
|
||||
address parameters_;
|
||||
uint32_t sharedMemBytes_;
|
||||
|
||||
public:
|
||||
//! Construct an ExecuteKernel command
|
||||
NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
|
||||
const NDRangeContainer& sizes);
|
||||
const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0);
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
|
||||
|
||||
@@ -772,6 +773,9 @@ class NDRangeKernelCommand : public Command {
|
||||
//! Return the kernel NDRange.
|
||||
const NDRangeContainer& sizes() const { return sizes_; }
|
||||
|
||||
//! Return the shared memory size
|
||||
uint32_t sharedMemBytes() const { return sharedMemBytes_; }
|
||||
|
||||
//! Set the local work size.
|
||||
void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user