diff --git a/opencl/amdocl/cl_event.cpp b/opencl/amdocl/cl_event.cpp index d4f2b04e26..4dfc9c0b94 100644 --- a/opencl/amdocl/cl_event.cpp +++ b/opencl/amdocl/cl_event.cpp @@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc return (cl_event)0; } - amd::Event* event = new amd::UserEvent(*as_amd(context)); - if (event == NULL) { + auto event = new amd::UserEvent(*as_amd(context)); + if (event == nullptr || !event->Create()) { + delete event; *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY; return (cl_event)0; } event->retain(); *not_null(errcode_ret) = CL_SUCCESS; - return as_cl(event); + return as_cl(reinterpret_cast(event)); } RUNTIME_EXIT @@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st if (execution_status > CL_COMPLETE) { return CL_INVALID_VALUE; } - - if (!as_amd(event)->setStatus(execution_status)) { + auto user_event = reinterpret_cast(as_amd(event)); + if (!user_event->SetExecutionStatus(execution_status)) { return CL_INVALID_OPERATION; } return CL_SUCCESS; diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index e3b3f736df..4ecbd04e56 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -95,6 +95,7 @@ class StreamOperationCommand; class BatchMemoryOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; +class UserEvent; class Isa; class Device; struct KernelParameterDescriptor; @@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject { ShouldNotReachHere(); } virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); } + virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); } virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; } virtual void ReleaseAllHwQueues() {} @@ -2037,6 +2039,13 @@ class Device : public RuntimeObject { return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false; } + /// @brief Creates HW user event for OpenCL implementation + /// @return The pointer to a HW event structure, known to the HW backend + virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; } + + /// @brief Sets HW user event to the complete status + virtual void SetUserEvent(amd::UserEvent* event) const {} + //! Returns TRUE if the device is available for computations bool isOnline() const { return online_; } diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 40ff3454ae..3ab0b06549 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire, amd::CopyMetadata copyMetadata) const { // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ || + (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata); @@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire, amd::CopyMetadata copyMetadata) const { // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).IsPersistentDirectMap()) { + if (dev().settings().blocking_blit_ && + (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).IsPersistentDirectMap())) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); @@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize)); } +// ================================================================================================ void DmaBlitManager::releaseBuffer(BufferState &buffer) const { if (buffer.pinnedMem_) { gpu().addPinnedMem(buffer.pinnedMem_); @@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata, bool enablePin) const { // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow - gpu().releaseGpuMemoryFence(hostToDev); + gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_); // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning bool status = true; @@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs copyMetadata.isAsync_); const_address src = static_cast(hostSrc) + copyOffset; status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata); - if (status ) { - // Wait for current signal of previous rocr copy if its not pinned mem + if (status) { if (outBuffer.pinnedMem_ == nullptr) { + // Wait for current signal of previous rocr copy if its not pinned mem gpu().Barriers().WaitCurrent(); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", hostDst + copyOffset, stagingBuffer, copysize); @@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs break; } } + // Release Pinned Memory back to pool if any releaseBuffer(outBuffer); // Update Offset and Transfer Size @@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs firstTx = false; } + // @note: HIP requires a blocking wait on D2H with the pageable system memory + if (amd::IS_HIP && !hostToDev) { + gpu().Barriers().WaitCurrent(); + } + if(!status) { return false; } return true; } + // ================================================================================================ KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) : DmaBlitManager(gpu, setup), @@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool result = false; // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && - !srcMemory.isCpuUncached())) { + if (dev().settings().blocking_blit_ && + (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && + !srcMemory.isCpuUncached()))) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata); @@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo bool result = false; // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).IsPersistentDirectMap()) { + if (dev().settings().blocking_blit_ && + (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).IsPersistentDirectMap())) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); @@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const { } // ================================================================================================ -bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, +bool KernelBlitManager::runScheduler(uint64_t vqVM, hsa_queue_t* schedulerQueue, - hsa_signal_t& schedulerSignal, - uint threads) { + uint threads, uint64_t aql_wrap) { size_t globalWorkOffset[1] = {0}; size_t globalWorkSize[1] = {threads}; size_t localWorkSize[1] = {1}; @@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, device::Kernel* devKernel = const_cast(kernels_[Scheduler]->getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); - SchedulerParam* sp = reinterpret_cast(schedulerParam->getHostMem()); + auto* sp = reinterpret_cast( + gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment)); memset(sp, 0, sizeof(SchedulerParam)); - Memory* schedulerMem = dev().getRocMemory(schedulerParam); - sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); + sp->kernarg_address = reinterpret_cast(sp); sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); - sp->complete_signal = schedulerSignal; - - hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - - + sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr); sp->vqueue_header = vqVM; - - sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); + sp->parentAQL = reinterpret_cast(aql_wrap); if (dev().info().maxEngineClockFrequency_ > 0) { sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; @@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, // Use a device side global atomics to workaround the reliance of PCIe 3 atomics sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); - cl_mem mem = as_cl(schedulerParam); - setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem); + constexpr bool kDirectVa = true; + setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa); address parameters = captureArguments(kernels_[Scheduler]); @@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, return false; } releaseArguments(parameters); - - if (!WaitForSignal(schedulerSignal)) { + // Wait for the scheduler to finish all operations + gpu().WaitCompleteSignal(sp->complete_signal); + // @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper + // value without the wait for all previous commands (see the PCIE3 atomics workaround above). + // The scheduler can enqueue extra commands, but the real queue write index didn't have any + // progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode + // with device enqueue and user events, because device enqueue is blocking below + if (!WaitForSignal(sp->complete_signal)) { LogWarning("Failed schedulerSignal wait"); return false; } - return true; } diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp index 8a7f03f454..9c4b16d580 100644 --- a/rocclr/device/rocm/rocblit.hpp +++ b/rocclr/device/rocm/rocblit.hpp @@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager { ) const; bool runScheduler(uint64_t vqVM, - amd::Memory* schedulerParam, hsa_queue_t* schedulerQueue, - hsa_signal_t& schedulerSignal, - uint threads); + uint threads, uint64_t aql_wrap); //! Runs a blit kernel for GWS init bool RunGwsInit(uint32_t value //!< Initial value for GWS resource diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 44f3822d33..3274cb22f3 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const { } } +// ================================================================================================ +bool Device::CreateUserEvent(amd::UserEvent* event) const { + std::unique_ptr signal(new ProfilingSignal()); + if ((signal == nullptr) || + (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) { + return false; + } + hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne); + event->SetHwEvent(signal.release()); + return true; +} + +// ================================================================================================ +void Device::SetUserEvent(amd::UserEvent* event) const { + auto signal = reinterpret_cast(event->HwEvent()); + assert(signal != nullptr && "Can't have user event without hw event!"); + hsa_signal_silent_store_relaxed(signal->signal_, 0); +} + // ================================================================================================ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 32dc66726c..0c7ce24805 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -450,6 +450,8 @@ class Device : public NullDevice { uint32_t hip_event_flags = 0) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; + virtual bool CreateUserEvent(amd::UserEvent* event) const; + virtual void SetUserEvent(amd::UserEvent* event) const; //! Allocate host memory in terms of numa policy set by user void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const; diff --git a/rocclr/device/rocm/rocsettings.cpp b/rocclr/device/rocm/rocsettings.cpp index d53231fdd9..9d89124cdb 100644 --- a/rocclr/device/rocm/rocsettings.cpp +++ b/rocclr/device/rocm/rocsettings.cpp @@ -97,6 +97,8 @@ Settings::Settings() { limit_blit_wg_ = 16; dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false; + // note: OCL user events don't allow CPU blocking calls in DD mode + blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH; } // ================================================================================================ diff --git a/rocclr/device/rocm/rocsettings.hpp b/rocclr/device/rocm/rocsettings.hpp index dd323b86ab..b6f741db8e 100644 --- a/rocclr/device/rocm/rocsettings.hpp +++ b/rocclr/device/rocm/rocsettings.hpp @@ -51,7 +51,8 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint barrier_value_packet_ : 1; //!< Barrier value packet functionality uint dynamic_queues_ : 1; //!< Dynamic queues management - uint reserved_ : 22; + uint blocking_blit_ : 1; //!< Blit ops can be blocking on CPU + uint reserved_ : 21; }; uint value_; }; diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index d5247ad82c..91b8e5146e 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, return false; } +// ================================================================================================ +void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) { + barrier_packet_.dep_signal[0] = signal; + dispatchBarrierPacket(kBarrierPacketHeader, false); +} + // ================================================================================================ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, hsa_signal_t signal) { @@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, deviceQueueSize_(0), maskGroups_(0), schedulerThreads_(0), - schedulerParam_(nullptr), schedulerQueue_(nullptr), - schedulerSignal_({0}), barriers_(*this), managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_), managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_), @@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() { delete printfdbg_; - if (0 != schedulerSignal_.handle) { - hsa_signal_destroy(schedulerSignal_); - } - if (nullptr != schedulerQueue_) { hsa_queue_destroy(schedulerQueue_); } - if (nullptr != schedulerParam_) { - schedulerParam_->release(); - } - if (nullptr != virtualQueue_) { virtualQueue_->release(); } @@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { // If we have host memory, use it if ((devMemory->owner()->getHostMem() != nullptr) && (devMemory->owner()->getSvmPtr() == nullptr)) { - if (!devMemory->isHostMemDirectAccess()) { + if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) { // Make sure GPU finished operation before synchronization with the backing store releaseGpuMemoryFence(); } @@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // ================================================================================================ bool VirtualGPU::createSchedulerParam() { - if (nullptr != schedulerParam_) { + if (nullptr != schedulerQueue_) { return true; } - while(true) { - schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(), - CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap)); - - if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) { - break; - } - + while (true) { // The queue is written by multiple threads of the scheduler kernel if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI, callbackQueue, &roc_device_, std::numeric_limits::max(), @@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam() break; } - hsa_signal_t signal0 = {0}; - - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) { - break; - } - - schedulerSignal_ = signal0; - - Memory* schedulerMem = dev().getRocMemory(schedulerParam_); - - if (nullptr == schedulerMem) { - break; - } - - schedulerParam_->setVirtualDevice(this); return true; } - if (0 != schedulerSignal_.handle) { - hsa_signal_destroy(schedulerSignal_); - schedulerSignal_.handle = 0; - } - if (nullptr != schedulerQueue_) { hsa_queue_destroy(schedulerQueue_); schedulerQueue_ = nullptr; } - if (nullptr != schedulerParam_) { - schedulerParam_->release(); - schedulerParam_ = nullptr; - } - return false; } @@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, global[i] = static_cast(sizes.global()[i]); local[i] = static_cast(local_size[i]); } - + uint64_t spVA = 0; // Check if runtime has to setup hidden arguments for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) { const auto& it = signature.at(i); @@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, break; } case amd::KernelParameterDescriptor::HiddenCompletionAction: { - uint64_t spVA = 0; - if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) { - Memory* schedulerMem = dev().getRocMemory(schedulerParam_); - AmdAqlWrap* wrap = reinterpret_cast( - reinterpret_cast(schedulerParam_->getHostMem()) + sizeof(SchedulerParam)); + if (devKernel->dynamicParallelism()) { + auto params = allocKernArg(sizeof(AmdAqlWrap), 64); + AmdAqlWrap* wrap = reinterpret_cast(params); memset(wrap, 0, sizeof(AmdAqlWrap)); wrap->state = AQL_WRAP_DONE; - spVA = reinterpret_cast(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam); + spVA = reinterpret_cast(wrap); } WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_); break; @@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchBarrierPacket(kBarrierPacketHeader, true); if (virtualQueue_ != nullptr) { static_cast(blitMgr()).runScheduler( - getVQVirtualAddress(), schedulerParam_, schedulerQueue_, - schedulerSignal_, schedulerThreads_); + getVQVirtualAddress(), schedulerQueue_, + schedulerThreads_, spVA); } } @@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) { // ================================================================================================ void VirtualGPU::addPinnedMem(amd::Memory* mem) { - //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait - //! unconditionally, before it can release pinned memory - releaseGpuMemoryFence(); if (!AMD_DIRECT_DISPATCH) { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { if (pinnedMems_.size() > 7) { pinnedMems_.front()->release(); @@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) { pinnedMems_.push_back(mem); } } else { - mem->release(); + if (command_ != nullptr) { + command_->AddPinnedMemory(mem); + } else { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); + mem->release(); + } } } diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 1d9c8b756b..1c54f653c3 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } void setFenceDirty(bool state) { fence_dirty_ = state; } - void HiddenHeapInit(); + void WaitCompleteSignal(hsa_signal_t signal); + void HiddenHeapInit(); void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } uint64_t getQueueID(); @@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice { //!< one thread uint schedulerThreads_; //!< The number of scheduler threads - amd::Memory* schedulerParam_; hsa_queue_t* schedulerQueue_; - hsa_signal_t schedulerSignal_; HwQueueTracker barriers_; //!< Tracks active barriers in ROCr diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index 6a20fe9e26..d9361703e7 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) { ScopedLock l(notify_lock_); if ((status() > CL_COMPLETE) && (nullptr != queue) && // If HW event was assigned, then notification can be ignored, since a barrier was issued - (HwEvent() == nullptr) && + // @note: Force the marker always in OCL for now, since OCL events require precise + // sequence of the status update + ((HwEvent() == nullptr) || !amd::IS_HIP) && !notified_.test_and_set()) { // Make sure the queue is draining the enqueued commands. amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait); @@ -364,8 +366,17 @@ void Command::enqueue() { // Notify all commands about the waiter. Barrier will be sent in order to obtain // HSA signal for a wait on the current queue - for (const auto& event : eventWaitList()) { - event->notifyCmdQueue(!kCpuWait); + for (const auto &event: eventWaitList()) { + if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) { + if (event->status() >= CL_COMPLETE) { + reinterpret_cast(event)->AddDependent(this); + } else { + setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + return; + } + } else { + event->notifyCmdQueue(!kCpuWait); + } } // The batch update must be lock protected to avoid a race condition diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index 3dbc814c54..977014f69d 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -397,6 +397,9 @@ class Command : public Event { //! Release the resources associated with this event. virtual void releaseResources(); + //! Empty function for adding pinned memory + virtual void AddPinnedMemory(Memory* pinned) {} + //! Set the next GPU command void setNext(Command* next) { next_ = next; } @@ -424,16 +427,46 @@ class Command : public Event { }; class UserEvent : public Command { - const Context& context_; + const Context& context_; //!< OCL context associated with the event + std::vector dependents_; //!< Commands, which depends on this user event public: UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) { setStatus(CL_SUBMITTED); } + //! Creates a user event in the backend layer + bool Create() { + if (AMD_DIRECT_DISPATCH) { + return context_.devices()[0]->CreateUserEvent(this); + } else { + return true; + } + } + + //! Sets the execution status of the user event + bool SetExecutionStatus(cl_int status) { + if (AMD_DIRECT_DISPATCH) { + // If it's invalid status, then mark dependent commands as invalid + if (status < CL_COMPLETE) { + for (auto it : dependents_) { + it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + } + } + dependents_.clear(); + context_.devices()[0]->SetUserEvent(this); + } + return setStatus(status); + } + + //! Adds dependent commands for the user event + void AddDependent(Command* command) { + dependents_.push_back(command); + } + virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); } - virtual const Context& context() const { return context_; } + const Context& context() const { return context_; } }; class ClGlEvent : public Command { @@ -450,7 +483,7 @@ class ClGlEvent : public Command { bool awaitCompletion() { return waitForFence(); } - virtual const Context& context() const { return context_; } + const Context& context() const { return context_; } }; inline Command& Event::command() { return *static_cast(this); } @@ -465,6 +498,7 @@ class NDRangeContainer; class OneMemoryArgCommand : public Command { protected: Memory* memory_; + std::vector pinned_memory_; //!< Pinned memory object public: OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, @@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command { memory_->release(); DEBUG_ONLY(memory_ = NULL); Command::releaseResources(); + for (auto it : pinned_memory_) { + it->release(); + } } + //! Adds pinned memory, used in this command for later release + virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); } bool validateMemory(); bool validatePeerMemory(); }; diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 665e5313f8..6e8d5a96ad 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) { (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) { cpu_wait = true; } + } else { + // Force CPU wait for OpenCL, since the tests may check OCL command status after finish + cpu_wait = true; } size_t batchSize = GetSubmissionBatchSize();