SWDEV-465041 - Add support for user events with DD (#321)

* SWDEV-465041 - Add support for user events with DD

User events can be replaced with HSA signals. Add the interface
to allocate HSA signal for user events and update the status on
CL_COMPLETE.
Force pinned path with DD to avoid blocking calls. Pinned memory
can be released only when the command is complete.
Simplify device enqueue path to use generic kernel arg buffer and
signals

* Fix notifyCmdQueue() logic for OCL

* Avoid blocking calls in OCL with DD

* Add event  destruciton in a case of the failure.

[ROCm/clr commit: 2305f8ae56]
Этот коммит содержится в:
Andryeyev, German
2025-08-12 19:04:36 -04:00
коммит произвёл GitHub
родитель a7ed228997
Коммит 6df9a49437
13 изменённых файлов: 168 добавлений и 105 удалений
+6 -5
Просмотреть файл
@@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc
return (cl_event)0;
}
amd::Event* event = new amd::UserEvent(*as_amd(context));
if (event == NULL) {
auto event = new amd::UserEvent(*as_amd(context));
if (event == nullptr || !event->Create()) {
delete event;
*not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
return (cl_event)0;
}
event->retain();
*not_null(errcode_ret) = CL_SUCCESS;
return as_cl(event);
return as_cl(reinterpret_cast<amd::Event*>(event));
}
RUNTIME_EXIT
@@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st
if (execution_status > CL_COMPLETE) {
return CL_INVALID_VALUE;
}
if (!as_amd(event)->setStatus(execution_status)) {
auto user_event = reinterpret_cast<amd::UserEvent*>(as_amd(event));
if (!user_event->SetExecutionStatus(execution_status)) {
return CL_INVALID_OPERATION;
}
return CL_SUCCESS;
+9
Просмотреть файл
@@ -95,6 +95,7 @@ class StreamOperationCommand;
class BatchMemoryOperationCommand;
class VirtualMapCommand;
class ExternalSemaphoreCmd;
class UserEvent;
class Isa;
class Device;
struct KernelParameterDescriptor;
@@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject {
ShouldNotReachHere();
}
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
virtual void ReleaseAllHwQueues() {}
@@ -2037,6 +2039,13 @@ class Device : public RuntimeObject {
return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
}
/// @brief Creates HW user event for OpenCL implementation
/// @return The pointer to a HW event structure, known to the HW backend
virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; }
/// @brief Sets HW user event to the complete status
virtual void SetUserEvent(amd::UserEvent* event) const {}
//! Returns TRUE if the device is available for computations
bool isOnline() const { return online_; }
+39 -29
Просмотреть файл
@@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, amd::CopyMetadata copyMetadata) const {
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, amd::CopyMetadata copyMetadata) const {
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
if (dev().settings().blocking_blit_ &&
(setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
}
// ================================================================================================
void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
if (buffer.pinnedMem_) {
gpu().addPinnedMem(buffer.pinnedMem_);
@@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
bool enablePin) const {
// Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
gpu().releaseGpuMemoryFence(hostToDev);
gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_);
// If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
// For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
bool status = true;
@@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
copyMetadata.isAsync_);
const_address src = static_cast<const_address>(hostSrc) + copyOffset;
status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
if (status ) {
// Wait for current signal of previous rocr copy if its not pinned mem
if (status) {
if (outBuffer.pinnedMem_ == nullptr) {
// Wait for current signal of previous rocr copy if its not pinned mem
gpu().Barriers().WaitCurrent();
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
hostDst + copyOffset, stagingBuffer, copysize);
@@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
break;
}
}
// Release Pinned Memory back to pool if any
releaseBuffer(outBuffer);
// Update Offset and Transfer Size
@@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
firstTx = false;
}
// @note: HIP requires a blocking wait on D2H with the pageable system memory
if (amd::IS_HIP && !hostToDev) {
gpu().Barriers().WaitCurrent();
}
if(!status) {
return false;
}
return true;
}
// ================================================================================================
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
: DmaBlitManager(gpu, setup),
@@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool result = false;
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
!srcMemory.isCpuUncached())) {
if (dev().settings().blocking_blit_ &&
(setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
!srcMemory.isCpuUncached()))) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
bool result = false;
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
if (dev().settings().blocking_blit_ &&
(setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const {
}
// ================================================================================================
bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
bool KernelBlitManager::runScheduler(uint64_t vqVM,
hsa_queue_t* schedulerQueue,
hsa_signal_t& schedulerSignal,
uint threads) {
uint threads, uint64_t aql_wrap) {
size_t globalWorkOffset[1] = {0};
size_t globalWorkSize[1] = {threads};
size_t localWorkSize[1] = {1};
@@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
device::Kernel* devKernel = const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
SchedulerParam* sp = reinterpret_cast<SchedulerParam*>(schedulerParam->getHostMem());
auto* sp = reinterpret_cast<SchedulerParam*>(
gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment));
memset(sp, 0, sizeof(SchedulerParam));
Memory* schedulerMem = dev().getRocMemory(schedulerParam);
sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
sp->kernarg_address = reinterpret_cast<uint64_t>(sp);
sp->thread_counter = 0;
sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
sp->complete_signal = schedulerSignal;
hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr);
sp->vqueue_header = vqVM;
sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
sp->parentAQL = reinterpret_cast<uint64_t>(aql_wrap);
if (dev().info().maxEngineClockFrequency_ > 0) {
sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
// Use a device side global atomics to workaround the reliance of PCIe 3 atomics
sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);
cl_mem mem = as_cl<amd::Memory>(schedulerParam);
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
constexpr bool kDirectVa = true;
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa);
address parameters = captureArguments(kernels_[Scheduler]);
@@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
return false;
}
releaseArguments(parameters);
if (!WaitForSignal(schedulerSignal)) {
// Wait for the scheduler to finish all operations
gpu().WaitCompleteSignal(sp->complete_signal);
// @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper
// value without the wait for all previous commands (see the PCIE3 atomics workaround above).
// The scheduler can enqueue extra commands, but the real queue write index didn't have any
// progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode
// with device enqueue and user events, because device enqueue is blocking below
if (!WaitForSignal(sp->complete_signal)) {
LogWarning("Failed schedulerSignal wait");
return false;
}
return true;
}
+1 -3
Просмотреть файл
@@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager {
) const;
bool runScheduler(uint64_t vqVM,
amd::Memory* schedulerParam,
hsa_queue_t* schedulerQueue,
hsa_signal_t& schedulerSignal,
uint threads);
uint threads, uint64_t aql_wrap);
//! Runs a blit kernel for GWS init
bool RunGwsInit(uint32_t value //!< Initial value for GWS resource
+19
Просмотреть файл
@@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
}
}
// ================================================================================================
bool Device::CreateUserEvent(amd::UserEvent* event) const {
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if ((signal == nullptr) ||
(HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) {
return false;
}
hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne);
event->SetHwEvent(signal.release());
return true;
}
// ================================================================================================
void Device::SetUserEvent(amd::UserEvent* event) const {
auto signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
assert(signal != nullptr && "Can't have user event without hw event!");
hsa_signal_silent_store_relaxed(signal->signal_, 0);
}
// ================================================================================================
bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
// Query ptr type to see if it's a HMM allocation
+2
Просмотреть файл
@@ -450,6 +450,8 @@ class Device : public NullDevice {
uint32_t hip_event_flags = 0) const;
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
virtual void ReleaseGlobalSignal(void* signal) const;
virtual bool CreateUserEvent(amd::UserEvent* event) const;
virtual void SetUserEvent(amd::UserEvent* event) const;
//! Allocate host memory in terms of numa policy set by user
void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
+2
Просмотреть файл
@@ -97,6 +97,8 @@ Settings::Settings() {
limit_blit_wg_ = 16;
dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
// note: OCL user events don't allow CPU blocking calls in DD mode
blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH;
}
// ================================================================================================
+2 -1
Просмотреть файл
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
uint dynamic_queues_ : 1; //!< Dynamic queues management
uint reserved_ : 22;
uint blocking_blit_ : 1; //!< Blit ops can be blocking on CPU
uint reserved_ : 21;
};
uint value_;
};
+27 -58
Просмотреть файл
@@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
return false;
}
// ================================================================================================
void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
barrier_packet_.dep_signal[0] = signal;
dispatchBarrierPacket(kBarrierPacketHeader, false);
}
// ================================================================================================
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
hsa_signal_t signal) {
@@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
deviceQueueSize_(0),
maskGroups_(0),
schedulerThreads_(0),
schedulerParam_(nullptr),
schedulerQueue_(nullptr),
schedulerSignal_({0}),
barriers_(*this),
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
@@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() {
delete printfdbg_;
if (0 != schedulerSignal_.handle) {
hsa_signal_destroy(schedulerSignal_);
}
if (nullptr != schedulerQueue_) {
hsa_queue_destroy(schedulerQueue_);
}
if (nullptr != schedulerParam_) {
schedulerParam_->release();
}
if (nullptr != virtualQueue_) {
virtualQueue_->release();
}
@@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
// If we have host memory, use it
if ((devMemory->owner()->getHostMem() != nullptr) &&
(devMemory->owner()->getSvmPtr() == nullptr)) {
if (!devMemory->isHostMemDirectAccess()) {
if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
// Make sure GPU finished operation before synchronization with the backing store
releaseGpuMemoryFence();
}
@@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
// ================================================================================================
bool VirtualGPU::createSchedulerParam()
{
if (nullptr != schedulerParam_) {
if (nullptr != schedulerQueue_) {
return true;
}
while(true) {
schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(),
CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap));
if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) {
break;
}
while (true) {
// The queue is written by multiple threads of the scheduler kernel
if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI,
callbackQueue, &roc_device_, std::numeric_limits<uint>::max(),
@@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam()
break;
}
hsa_signal_t signal0 = {0};
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) {
break;
}
schedulerSignal_ = signal0;
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
if (nullptr == schedulerMem) {
break;
}
schedulerParam_->setVirtualDevice(this);
return true;
}
if (0 != schedulerSignal_.handle) {
hsa_signal_destroy(schedulerSignal_);
schedulerSignal_.handle = 0;
}
if (nullptr != schedulerQueue_) {
hsa_queue_destroy(schedulerQueue_);
schedulerQueue_ = nullptr;
}
if (nullptr != schedulerParam_) {
schedulerParam_->release();
schedulerParam_ = nullptr;
}
return false;
}
@@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
global[i] = static_cast<uint32_t>(sizes.global()[i]);
local[i] = static_cast<uint16_t>(local_size[i]);
}
uint64_t spVA = 0;
// Check if runtime has to setup hidden arguments
for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
const auto& it = signature.at(i);
@@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
break;
}
case amd::KernelParameterDescriptor::HiddenCompletionAction: {
uint64_t spVA = 0;
if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) {
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(
reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
if (devKernel->dynamicParallelism()) {
auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_DONE;
spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
spVA = reinterpret_cast<uint64_t>(wrap);
}
WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
break;
@@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
dispatchBarrierPacket(kBarrierPacketHeader, true);
if (virtualQueue_ != nullptr) {
static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
getVQVirtualAddress(), schedulerParam_, schedulerQueue_,
schedulerSignal_, schedulerThreads_);
getVQVirtualAddress(), schedulerQueue_,
schedulerThreads_, spVA);
}
}
@@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
// ================================================================================================
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (!AMD_DIRECT_DISPATCH) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
pinnedMems_.front()->release();
@@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
pinnedMems_.push_back(mem);
}
} else {
mem->release();
if (command_ != nullptr) {
command_->AddPinnedMemory(mem);
} else {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
mem->release();
}
}
}
+2 -3
Просмотреть файл
@@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice {
void* allocKernArg(size_t size, size_t alignment);
bool isFenceDirty() const { return fence_dirty_; }
void setFenceDirty(bool state) { fence_dirty_ = state; }
void HiddenHeapInit();
void WaitCompleteSignal(hsa_signal_t signal);
void HiddenHeapInit();
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
uint64_t getQueueID();
@@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice {
//!< one thread
uint schedulerThreads_; //!< The number of scheduler threads
amd::Memory* schedulerParam_;
hsa_queue_t* schedulerQueue_;
hsa_signal_t schedulerSignal_;
HwQueueTracker barriers_; //!< Tracks active barriers in ROCr
+14 -3
Просмотреть файл
@@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
ScopedLock l(notify_lock_);
if ((status() > CL_COMPLETE) && (nullptr != queue) &&
// If HW event was assigned, then notification can be ignored, since a barrier was issued
(HwEvent() == nullptr) &&
// @note: Force the marker always in OCL for now, since OCL events require precise
// sequence of the status update
((HwEvent() == nullptr) || !amd::IS_HIP) &&
!notified_.test_and_set()) {
// Make sure the queue is draining the enqueued commands.
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
@@ -364,8 +366,17 @@ void Command::enqueue() {
// Notify all commands about the waiter. Barrier will be sent in order to obtain
// HSA signal for a wait on the current queue
for (const auto& event : eventWaitList()) {
event->notifyCmdQueue(!kCpuWait);
for (const auto &event: eventWaitList()) {
if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) {
if (event->status() >= CL_COMPLETE) {
reinterpret_cast<amd::UserEvent*>(event)->AddDependent(this);
} else {
setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
return;
}
} else {
event->notifyCmdQueue(!kCpuWait);
}
}
// The batch update must be lock protected to avoid a race condition
+42 -3
Просмотреть файл
@@ -397,6 +397,9 @@ class Command : public Event {
//! Release the resources associated with this event.
virtual void releaseResources();
//! Empty function for adding pinned memory
virtual void AddPinnedMemory(Memory* pinned) {}
//! Set the next GPU command
void setNext(Command* next) { next_ = next; }
@@ -424,16 +427,46 @@ class Command : public Event {
};
class UserEvent : public Command {
const Context& context_;
const Context& context_; //!< OCL context associated with the event
std::vector<Command*> dependents_; //!< Commands, which depends on this user event
public:
UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) {
setStatus(CL_SUBMITTED);
}
//! Creates a user event in the backend layer
bool Create() {
if (AMD_DIRECT_DISPATCH) {
return context_.devices()[0]->CreateUserEvent(this);
} else {
return true;
}
}
//! Sets the execution status of the user event
bool SetExecutionStatus(cl_int status) {
if (AMD_DIRECT_DISPATCH) {
// If it's invalid status, then mark dependent commands as invalid
if (status < CL_COMPLETE) {
for (auto it : dependents_) {
it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
}
}
dependents_.clear();
context_.devices()[0]->SetUserEvent(this);
}
return setStatus(status);
}
//! Adds dependent commands for the user event
void AddDependent(Command* command) {
dependents_.push_back(command);
}
virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); }
virtual const Context& context() const { return context_; }
const Context& context() const { return context_; }
};
class ClGlEvent : public Command {
@@ -450,7 +483,7 @@ class ClGlEvent : public Command {
bool awaitCompletion() { return waitForFence(); }
virtual const Context& context() const { return context_; }
const Context& context() const { return context_; }
};
inline Command& Event::command() { return *static_cast<Command*>(this); }
@@ -465,6 +498,7 @@ class NDRangeContainer;
class OneMemoryArgCommand : public Command {
protected:
Memory* memory_;
std::vector<Memory*> pinned_memory_; //!< Pinned memory object
public:
OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
@@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command {
memory_->release();
DEBUG_ONLY(memory_ = NULL);
Command::releaseResources();
for (auto it : pinned_memory_) {
it->release();
}
}
//! Adds pinned memory, used in this command for later release
virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); }
bool validateMemory();
bool validatePeerMemory();
};
+3
Просмотреть файл
@@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) {
(vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
cpu_wait = true;
}
} else {
// Force CPU wait for OpenCL, since the tests may check OCL command status after finish
cpu_wait = true;
}
size_t batchSize = GetSubmissionBatchSize();