diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index b0a23a45fe..359331cc85 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -2100,7 +2100,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { profilingBegin(vcmd); // Submit kernel to HW - if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) { + if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) { vcmd.setStatus(CL_INVALID_OPERATION); } @@ -2110,7 +2110,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, bool nativeMem, - amd::Event* enqueueEvent) + amd::Event* enqueueEvent, uint32_t sharedMemBytes) { size_t newOffset[3] = { 0, 0, 0 }; size_t newGlobalSize[3] = { 0, 0, 0 }; @@ -2232,7 +2232,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const uint64_t vmParentWrap = 0; // Program the kernel arguments for the GPU execution hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments( - *this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap); + *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap); if (nullptr == aqlPkt) { LogError("Couldn't load kernel arguments"); return false; diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index a7a6b38ee7..aff4163677 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -299,7 +299,8 @@ class VirtualGPU : public device::VirtualDevice { const amd::Kernel& kernel, //!< Kernel for execution const_address parameters, //!< Parameters for the kernel bool nativeMem = true, //!< Native memory objects - amd::Event* enqueueEvent = nullptr //!< Event provided in the enqueue kernel command + amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command + uint32_t sharedMemBytes = 0 //!< Shared memory size ); void submitNativeFn(amd::NativeFnCommand& vcmd); void submitFillMemory(amd::FillMemoryCommand& vcmd); diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index 38cbde14c9..1c6567adfa 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -1938,7 +1938,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) } bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, - const_address parameters, void* eventHandle) { + const_address parameters, void* eventHandle, uint32_t sharedMemBytes) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); @@ -2113,7 +2113,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1; dispatchPacket.kernarg_address = argBuffer; - dispatchPacket.group_segment_size = ldsUsage; + dispatchPacket.group_segment_size = ldsUsage + sharedMemBytes; dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; // Dispatch the packet @@ -2154,7 +2154,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // Submit kernel to HW if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), - static_cast(as_cl(&vcmd.event())))) { + static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) { LogError("AQL dispatch failed!"); vcmd.setStatus(CL_INVALID_OPERATION); } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp index 0e9e18981b..7b7d9290b2 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -166,7 +166,8 @@ class VirtualGPU : public device::VirtualDevice { bool submitKernelInternal(const amd::NDRangeContainer& sizes, //!< Workload sizes const amd::Kernel& kernel, //!< Kernel for execution const_address parameters, //!< Parameters for the kernel - void* event_handle //!< Handle to OCL event for debugging + void* event_handle, //!< Handle to OCL event for debugging + uint32_t sharedMemBytes = 0 //!< Shared memory size ); void submitNativeFn(amd::NativeFnCommand& cmd); void submitMarker(amd::Marker& cmd); diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp index 99873ac981..19556b9939 100644 --- a/projects/clr/rocclr/runtime/platform/command.cpp +++ b/projects/clr/rocclr/runtime/platform/command.cpp @@ -226,8 +226,8 @@ void Command::enqueue() { const Context& Command::context() const { return queue_->context(); } NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, - Kernel& kernel, const NDRangeContainer& sizes) - : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes) { + Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes) + : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes), sharedMemBytes_(sharedMemBytes) { auto& device = queue.device(); auto devKernel = const_cast(kernel.getDeviceKernel(device)); profilingInfo_.setCallback(devKernel->getProfilingCallback( diff --git a/projects/clr/rocclr/runtime/platform/command.hpp b/projects/clr/rocclr/runtime/platform/command.hpp index e979112ada..bad260289e 100644 --- a/projects/clr/rocclr/runtime/platform/command.hpp +++ b/projects/clr/rocclr/runtime/platform/command.hpp @@ -752,11 +752,12 @@ class NDRangeKernelCommand : public Command { Kernel& kernel_; NDRangeContainer sizes_; address parameters_; + uint32_t sharedMemBytes_; public: //! Construct an ExecuteKernel command NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel, - const NDRangeContainer& sizes); + const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0); virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); } @@ -772,6 +773,9 @@ class NDRangeKernelCommand : public Command { //! Return the kernel NDRange. const NDRangeContainer& sizes() const { return sizes_; } + //! Return the shared memory size + uint32_t sharedMemBytes() const { return sharedMemBytes_; } + //! Set the local work size. void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }