From 6df9a4943797fc52f5ce9690523e7bded9d478f6 Mon Sep 17 00:00:00 2001 From: "Andryeyev, German" Date: Tue, 12 Aug 2025 19:04:36 -0400 Subject: [PATCH] SWDEV-465041 - Add support for user events with DD (#321) * SWDEV-465041 - Add support for user events with DD User events can be replaced with HSA signals. Add the interface to allocate HSA signal for user events and update the status on CL_COMPLETE. Force pinned path with DD to avoid blocking calls. Pinned memory can be released only when the command is complete. Simplify device enqueue path to use generic kernel arg buffer and signals * Fix notifyCmdQueue() logic for OCL * Avoid blocking calls in OCL with DD * Add event destruciton in a case of the failure. [ROCm/clr commit: 2305f8ae565389cfc348a747c467c004909d7a90] --- projects/clr/opencl/amdocl/cl_event.cpp | 11 +-- projects/clr/rocclr/device/device.hpp | 9 ++ projects/clr/rocclr/device/rocm/rocblit.cpp | 68 ++++++++------- projects/clr/rocclr/device/rocm/rocblit.hpp | 4 +- projects/clr/rocclr/device/rocm/rocdevice.cpp | 19 +++++ projects/clr/rocclr/device/rocm/rocdevice.hpp | 2 + .../clr/rocclr/device/rocm/rocsettings.cpp | 2 + .../clr/rocclr/device/rocm/rocsettings.hpp | 3 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 85 ++++++------------- .../clr/rocclr/device/rocm/rocvirtual.hpp | 5 +- projects/clr/rocclr/platform/command.cpp | 17 +++- projects/clr/rocclr/platform/command.hpp | 45 +++++++++- projects/clr/rocclr/platform/commandqueue.cpp | 3 + 13 files changed, 168 insertions(+), 105 deletions(-) diff --git a/projects/clr/opencl/amdocl/cl_event.cpp b/projects/clr/opencl/amdocl/cl_event.cpp index d4f2b04e26..4dfc9c0b94 100644 --- a/projects/clr/opencl/amdocl/cl_event.cpp +++ b/projects/clr/opencl/amdocl/cl_event.cpp @@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc return (cl_event)0; } - amd::Event* event = new amd::UserEvent(*as_amd(context)); - if (event == NULL) { + auto event = new amd::UserEvent(*as_amd(context)); + if (event == nullptr || !event->Create()) { + delete event; *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY; return (cl_event)0; } event->retain(); *not_null(errcode_ret) = CL_SUCCESS; - return as_cl(event); + return as_cl(reinterpret_cast(event)); } RUNTIME_EXIT @@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st if (execution_status > CL_COMPLETE) { return CL_INVALID_VALUE; } - - if (!as_amd(event)->setStatus(execution_status)) { + auto user_event = reinterpret_cast(as_amd(event)); + if (!user_event->SetExecutionStatus(execution_status)) { return CL_INVALID_OPERATION; } return CL_SUCCESS; diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index e3b3f736df..4ecbd04e56 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -95,6 +95,7 @@ class StreamOperationCommand; class BatchMemoryOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; +class UserEvent; class Isa; class Device; struct KernelParameterDescriptor; @@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject { ShouldNotReachHere(); } virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); } + virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); } virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; } virtual void ReleaseAllHwQueues() {} @@ -2037,6 +2039,13 @@ class Device : public RuntimeObject { return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false; } + /// @brief Creates HW user event for OpenCL implementation + /// @return The pointer to a HW event structure, known to the HW backend + virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; } + + /// @brief Sets HW user event to the complete status + virtual void SetUserEvent(amd::UserEvent* event) const {} + //! Returns TRUE if the device is available for computations bool isOnline() const { return online_; } diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 40ff3454ae..3ab0b06549 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire, amd::CopyMetadata copyMetadata) const { // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ || + (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata); @@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire, amd::CopyMetadata copyMetadata) const { // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).IsPersistentDirectMap()) { + if (dev().settings().blocking_blit_ && + (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).IsPersistentDirectMap())) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); @@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize)); } +// ================================================================================================ void DmaBlitManager::releaseBuffer(BufferState &buffer) const { if (buffer.pinnedMem_) { gpu().addPinnedMem(buffer.pinnedMem_); @@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata, bool enablePin) const { // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow - gpu().releaseGpuMemoryFence(hostToDev); + gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_); // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning bool status = true; @@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs copyMetadata.isAsync_); const_address src = static_cast(hostSrc) + copyOffset; status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata); - if (status ) { - // Wait for current signal of previous rocr copy if its not pinned mem + if (status) { if (outBuffer.pinnedMem_ == nullptr) { + // Wait for current signal of previous rocr copy if its not pinned mem gpu().Barriers().WaitCurrent(); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", hostDst + copyOffset, stagingBuffer, copysize); @@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs break; } } + // Release Pinned Memory back to pool if any releaseBuffer(outBuffer); // Update Offset and Transfer Size @@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs firstTx = false; } + // @note: HIP requires a blocking wait on D2H with the pageable system memory + if (amd::IS_HIP && !hostToDev) { + gpu().Barriers().WaitCurrent(); + } + if(!status) { return false; } return true; } + // ================================================================================================ KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) : DmaBlitManager(gpu, setup), @@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool result = false; // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && - !srcMemory.isCpuUncached())) { + if (dev().settings().blocking_blit_ && + (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && + !srcMemory.isCpuUncached()))) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata); @@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo bool result = false; // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).IsPersistentDirectMap()) { + if (dev().settings().blocking_blit_ && + (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).IsPersistentDirectMap())) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); @@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const { } // ================================================================================================ -bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, +bool KernelBlitManager::runScheduler(uint64_t vqVM, hsa_queue_t* schedulerQueue, - hsa_signal_t& schedulerSignal, - uint threads) { + uint threads, uint64_t aql_wrap) { size_t globalWorkOffset[1] = {0}; size_t globalWorkSize[1] = {threads}; size_t localWorkSize[1] = {1}; @@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, device::Kernel* devKernel = const_cast(kernels_[Scheduler]->getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); - SchedulerParam* sp = reinterpret_cast(schedulerParam->getHostMem()); + auto* sp = reinterpret_cast( + gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment)); memset(sp, 0, sizeof(SchedulerParam)); - Memory* schedulerMem = dev().getRocMemory(schedulerParam); - sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); + sp->kernarg_address = reinterpret_cast(sp); sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); - sp->complete_signal = schedulerSignal; - - hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - - + sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr); sp->vqueue_header = vqVM; - - sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); + sp->parentAQL = reinterpret_cast(aql_wrap); if (dev().info().maxEngineClockFrequency_ > 0) { sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; @@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, // Use a device side global atomics to workaround the reliance of PCIe 3 atomics sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); - cl_mem mem = as_cl(schedulerParam); - setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem); + constexpr bool kDirectVa = true; + setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa); address parameters = captureArguments(kernels_[Scheduler]); @@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, return false; } releaseArguments(parameters); - - if (!WaitForSignal(schedulerSignal)) { + // Wait for the scheduler to finish all operations + gpu().WaitCompleteSignal(sp->complete_signal); + // @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper + // value without the wait for all previous commands (see the PCIE3 atomics workaround above). + // The scheduler can enqueue extra commands, but the real queue write index didn't have any + // progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode + // with device enqueue and user events, because device enqueue is blocking below + if (!WaitForSignal(sp->complete_signal)) { LogWarning("Failed schedulerSignal wait"); return false; } - return true; } diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp index 8a7f03f454..9c4b16d580 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/device/rocm/rocblit.hpp @@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager { ) const; bool runScheduler(uint64_t vqVM, - amd::Memory* schedulerParam, hsa_queue_t* schedulerQueue, - hsa_signal_t& schedulerSignal, - uint threads); + uint threads, uint64_t aql_wrap); //! Runs a blit kernel for GWS init bool RunGwsInit(uint32_t value //!< Initial value for GWS resource diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 44f3822d33..3274cb22f3 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const { } } +// ================================================================================================ +bool Device::CreateUserEvent(amd::UserEvent* event) const { + std::unique_ptr signal(new ProfilingSignal()); + if ((signal == nullptr) || + (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) { + return false; + } + hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne); + event->SetHwEvent(signal.release()); + return true; +} + +// ================================================================================================ +void Device::SetUserEvent(amd::UserEvent* event) const { + auto signal = reinterpret_cast(event->HwEvent()); + assert(signal != nullptr && "Can't have user event without hw event!"); + hsa_signal_silent_store_relaxed(signal->signal_, 0); +} + // ================================================================================================ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 32dc66726c..0c7ce24805 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -450,6 +450,8 @@ class Device : public NullDevice { uint32_t hip_event_flags = 0) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; + virtual bool CreateUserEvent(amd::UserEvent* event) const; + virtual void SetUserEvent(amd::UserEvent* event) const; //! Allocate host memory in terms of numa policy set by user void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const; diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index d53231fdd9..9d89124cdb 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -97,6 +97,8 @@ Settings::Settings() { limit_blit_wg_ = 16; dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false; + // note: OCL user events don't allow CPU blocking calls in DD mode + blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH; } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index dd323b86ab..b6f741db8e 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -51,7 +51,8 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint barrier_value_packet_ : 1; //!< Barrier value packet functionality uint dynamic_queues_ : 1; //!< Dynamic queues management - uint reserved_ : 22; + uint blocking_blit_ : 1; //!< Blit ops can be blocking on CPU + uint reserved_ : 21; }; uint value_; }; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index d5247ad82c..91b8e5146e 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, return false; } +// ================================================================================================ +void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) { + barrier_packet_.dep_signal[0] = signal; + dispatchBarrierPacket(kBarrierPacketHeader, false); +} + // ================================================================================================ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, hsa_signal_t signal) { @@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, deviceQueueSize_(0), maskGroups_(0), schedulerThreads_(0), - schedulerParam_(nullptr), schedulerQueue_(nullptr), - schedulerSignal_({0}), barriers_(*this), managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_), managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_), @@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() { delete printfdbg_; - if (0 != schedulerSignal_.handle) { - hsa_signal_destroy(schedulerSignal_); - } - if (nullptr != schedulerQueue_) { hsa_queue_destroy(schedulerQueue_); } - if (nullptr != schedulerParam_) { - schedulerParam_->release(); - } - if (nullptr != virtualQueue_) { virtualQueue_->release(); } @@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { // If we have host memory, use it if ((devMemory->owner()->getHostMem() != nullptr) && (devMemory->owner()->getSvmPtr() == nullptr)) { - if (!devMemory->isHostMemDirectAccess()) { + if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) { // Make sure GPU finished operation before synchronization with the backing store releaseGpuMemoryFence(); } @@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // ================================================================================================ bool VirtualGPU::createSchedulerParam() { - if (nullptr != schedulerParam_) { + if (nullptr != schedulerQueue_) { return true; } - while(true) { - schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(), - CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap)); - - if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) { - break; - } - + while (true) { // The queue is written by multiple threads of the scheduler kernel if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI, callbackQueue, &roc_device_, std::numeric_limits::max(), @@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam() break; } - hsa_signal_t signal0 = {0}; - - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) { - break; - } - - schedulerSignal_ = signal0; - - Memory* schedulerMem = dev().getRocMemory(schedulerParam_); - - if (nullptr == schedulerMem) { - break; - } - - schedulerParam_->setVirtualDevice(this); return true; } - if (0 != schedulerSignal_.handle) { - hsa_signal_destroy(schedulerSignal_); - schedulerSignal_.handle = 0; - } - if (nullptr != schedulerQueue_) { hsa_queue_destroy(schedulerQueue_); schedulerQueue_ = nullptr; } - if (nullptr != schedulerParam_) { - schedulerParam_->release(); - schedulerParam_ = nullptr; - } - return false; } @@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, global[i] = static_cast(sizes.global()[i]); local[i] = static_cast(local_size[i]); } - + uint64_t spVA = 0; // Check if runtime has to setup hidden arguments for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) { const auto& it = signature.at(i); @@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, break; } case amd::KernelParameterDescriptor::HiddenCompletionAction: { - uint64_t spVA = 0; - if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) { - Memory* schedulerMem = dev().getRocMemory(schedulerParam_); - AmdAqlWrap* wrap = reinterpret_cast( - reinterpret_cast(schedulerParam_->getHostMem()) + sizeof(SchedulerParam)); + if (devKernel->dynamicParallelism()) { + auto params = allocKernArg(sizeof(AmdAqlWrap), 64); + AmdAqlWrap* wrap = reinterpret_cast(params); memset(wrap, 0, sizeof(AmdAqlWrap)); wrap->state = AQL_WRAP_DONE; - spVA = reinterpret_cast(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam); + spVA = reinterpret_cast(wrap); } WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_); break; @@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchBarrierPacket(kBarrierPacketHeader, true); if (virtualQueue_ != nullptr) { static_cast(blitMgr()).runScheduler( - getVQVirtualAddress(), schedulerParam_, schedulerQueue_, - schedulerSignal_, schedulerThreads_); + getVQVirtualAddress(), schedulerQueue_, + schedulerThreads_, spVA); } } @@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) { // ================================================================================================ void VirtualGPU::addPinnedMem(amd::Memory* mem) { - //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait - //! unconditionally, before it can release pinned memory - releaseGpuMemoryFence(); if (!AMD_DIRECT_DISPATCH) { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { if (pinnedMems_.size() > 7) { pinnedMems_.front()->release(); @@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) { pinnedMems_.push_back(mem); } } else { - mem->release(); + if (command_ != nullptr) { + command_->AddPinnedMemory(mem); + } else { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); + mem->release(); + } } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 1d9c8b756b..1c54f653c3 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } void setFenceDirty(bool state) { fence_dirty_ = state; } - void HiddenHeapInit(); + void WaitCompleteSignal(hsa_signal_t signal); + void HiddenHeapInit(); void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } uint64_t getQueueID(); @@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice { //!< one thread uint schedulerThreads_; //!< The number of scheduler threads - amd::Memory* schedulerParam_; hsa_queue_t* schedulerQueue_; - hsa_signal_t schedulerSignal_; HwQueueTracker barriers_; //!< Tracks active barriers in ROCr diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 6a20fe9e26..d9361703e7 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) { ScopedLock l(notify_lock_); if ((status() > CL_COMPLETE) && (nullptr != queue) && // If HW event was assigned, then notification can be ignored, since a barrier was issued - (HwEvent() == nullptr) && + // @note: Force the marker always in OCL for now, since OCL events require precise + // sequence of the status update + ((HwEvent() == nullptr) || !amd::IS_HIP) && !notified_.test_and_set()) { // Make sure the queue is draining the enqueued commands. amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait); @@ -364,8 +366,17 @@ void Command::enqueue() { // Notify all commands about the waiter. Barrier will be sent in order to obtain // HSA signal for a wait on the current queue - for (const auto& event : eventWaitList()) { - event->notifyCmdQueue(!kCpuWait); + for (const auto &event: eventWaitList()) { + if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) { + if (event->status() >= CL_COMPLETE) { + reinterpret_cast(event)->AddDependent(this); + } else { + setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + return; + } + } else { + event->notifyCmdQueue(!kCpuWait); + } } // The batch update must be lock protected to avoid a race condition diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 3dbc814c54..977014f69d 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -397,6 +397,9 @@ class Command : public Event { //! Release the resources associated with this event. virtual void releaseResources(); + //! Empty function for adding pinned memory + virtual void AddPinnedMemory(Memory* pinned) {} + //! Set the next GPU command void setNext(Command* next) { next_ = next; } @@ -424,16 +427,46 @@ class Command : public Event { }; class UserEvent : public Command { - const Context& context_; + const Context& context_; //!< OCL context associated with the event + std::vector dependents_; //!< Commands, which depends on this user event public: UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) { setStatus(CL_SUBMITTED); } + //! Creates a user event in the backend layer + bool Create() { + if (AMD_DIRECT_DISPATCH) { + return context_.devices()[0]->CreateUserEvent(this); + } else { + return true; + } + } + + //! Sets the execution status of the user event + bool SetExecutionStatus(cl_int status) { + if (AMD_DIRECT_DISPATCH) { + // If it's invalid status, then mark dependent commands as invalid + if (status < CL_COMPLETE) { + for (auto it : dependents_) { + it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + } + } + dependents_.clear(); + context_.devices()[0]->SetUserEvent(this); + } + return setStatus(status); + } + + //! Adds dependent commands for the user event + void AddDependent(Command* command) { + dependents_.push_back(command); + } + virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); } - virtual const Context& context() const { return context_; } + const Context& context() const { return context_; } }; class ClGlEvent : public Command { @@ -450,7 +483,7 @@ class ClGlEvent : public Command { bool awaitCompletion() { return waitForFence(); } - virtual const Context& context() const { return context_; } + const Context& context() const { return context_; } }; inline Command& Event::command() { return *static_cast(this); } @@ -465,6 +498,7 @@ class NDRangeContainer; class OneMemoryArgCommand : public Command { protected: Memory* memory_; + std::vector pinned_memory_; //!< Pinned memory object public: OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, @@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command { memory_->release(); DEBUG_ONLY(memory_ = NULL); Command::releaseResources(); + for (auto it : pinned_memory_) { + it->release(); + } } + //! Adds pinned memory, used in this command for later release + virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); } bool validateMemory(); bool validatePeerMemory(); }; diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp index 665e5313f8..6e8d5a96ad 100644 --- a/projects/clr/rocclr/platform/commandqueue.cpp +++ b/projects/clr/rocclr/platform/commandqueue.cpp @@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) { (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) { cpu_wait = true; } + } else { + // Force CPU wait for OpenCL, since the tests may check OCL command status after finish + cpu_wait = true; } size_t batchSize = GetSubmissionBatchSize();