diff --git a/rocclr/runtime/platform/command.cpp b/rocclr/runtime/platform/command.cpp index 83bbb16fd1..b540d6b8fc 100644 --- a/rocclr/runtime/platform/command.cpp +++ b/rocclr/runtime/platform/command.cpp @@ -187,14 +187,16 @@ bool Event::notifyCmdQueue() { const Event::EventWaitList Event::nullWaitList(0); -Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList) +Command::Command(HostQueue& queue, cl_command_type type, + const EventWaitList& eventWaitList, uint32_t commandWaitBits) : Event(queue), queue_(&queue), next_(NULL), type_(type), exception_(0), data_(NULL), - eventWaitList_(eventWaitList) { + eventWaitList_(eventWaitList), + commandWaitBits_(commandWaitBits) { // Retain the commands from the event wait list. std::for_each(eventWaitList.begin(), eventWaitList.end(), std::mem_fun(&Command::retain)); } @@ -218,7 +220,8 @@ void Command::enqueue() { } queue_->append(*this); queue_->flush(); - if (queue_->device().settings().waitCommand_ && (type_ != 0)) { + if ((queue_->device().settings().waitCommand_ && (type_ != 0)) || + ((commandWaitBits_ & 0x2) != 0)) { awaitCompletion(); } } @@ -228,7 +231,7 @@ const Context& Command::context() const { return queue_->context(); } NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes, uint32_t extraParam) - : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList) + : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL) , kernel_(kernel) , sizes_(sizes) , sharedMemBytes_(sharedMemBytes) diff --git a/rocclr/runtime/platform/command.hpp b/rocclr/runtime/platform/command.hpp index cf8b2ed507..2a52954d5f 100644 --- a/rocclr/runtime/platform/command.hpp +++ b/rocclr/runtime/platform/command.hpp @@ -91,8 +91,8 @@ class Event : public RuntimeObject { uint64_t submitted_; uint64_t start_; uint64_t end_; - bool enabled_; //!< Profiling enabled for the 
wave limiter - uint32_t waves_; //!< The number of waves used in a dispatch + bool enabled_; //!< Profiling enabled for the wave limiter + uint32_t waves_; //!< The number of waves used in a dispatch ProfilingCallback* callback_; void clear() { queued_ = 0ULL; @@ -202,9 +202,13 @@ class Command : public Event { //! The Events that need to complete before this command is submitted. EventWaitList eventWaitList_; + //! Force await completion of previous command + //! 0x1 - wait before enqueue, 0x2 - wait after, 0x3 - wait both. + uint32_t commandWaitBits_; + //! Construct a new command of the given OpenCL type. - Command(HostQueue& queue, cl_command_type type, - const EventWaitList& eventWaitList = nullWaitList); + Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList, + uint32_t commandWaitBits = 0); //! Construct a new command of the given OpenCL type. Command(cl_command_type type) @@ -214,7 +218,8 @@ class Command : public Event { type_(type), exception_(0), data_(NULL), - eventWaitList_(nullWaitList) {} + eventWaitList_(nullWaitList), + commandWaitBits_(0) {} bool terminate() { if (Agent::shouldPostEventEvents() && type() != 0) { @@ -271,6 +276,9 @@ class Command : public Event { //! Return the context for this event. virtual const Context& context() const; + + //! 
Get command wait bits + uint32_t getWaitBits() const { return commandWaitBits_; } }; class UserEvent : public Command { @@ -319,7 +327,7 @@ class OneMemoryArgCommand : public Command { public: OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, Memory& memory) - : Command(queue, type, eventWaitList), memory_(&memory) { + : Command(queue, type, eventWaitList, AMD_SERIALIZE_COPY), memory_(&memory) { memory_->retain(); } @@ -342,7 +350,9 @@ class TwoMemoryArgsCommand : public Command { public: TwoMemoryArgsCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, Memory& memory1, Memory& memory2) - : Command(queue, type, eventWaitList), memory1_(&memory1), memory2_(&memory2) { + : Command(queue, type, eventWaitList, AMD_SERIALIZE_COPY), + memory1_(&memory1), + memory2_(&memory2) { memory1_->retain(); memory2_->retain(); } @@ -788,7 +798,8 @@ class NDRangeKernelCommand : public Command { //! Return the cooperative multi device groups mode bool cooperativeMultiDeviceGroups() const { - return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false; } + return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false; + } //! Set the local work size. void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; } @@ -995,7 +1006,7 @@ class ThreadTraceMemObjectsCommand : public Command { private: std::vector memObjects_; //!< The list of memory objects,bound to the thread trace size_t sizeMemObjects_; //!< The size of each memory object from memObjects_ list (all memory - //!objects have the smae size) + //! 
objects have the same size) ThreadTrace& threadTrace_; //!< The Thread Trace object }; @@ -1196,7 +1207,7 @@ class SvmMapMemoryCommand : public Command { Memory* svmMem_; //!< the pointer to the amd::Memory object corresponding the svm pointer mapped Coord3D size_; //!< the map size Coord3D origin_; //!< the origin of the mapped svm pointer shift from the beginning of svm space - //!allocated + //! allocated cl_map_flags flags_; //!< map flags void* svmPtr_; @@ -1315,12 +1326,11 @@ class TransferBufferFileCommand : public OneMemoryArgCommand { class CopyMemoryP2PCommand : public CopyMemoryCommand { public: - CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, - Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin, - Coord3D size) - : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin, size) - { - } + CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType, + const EventWaitList& eventWaitList, Memory& srcMemory, Memory& dstMemory, + Coord3D srcOrigin, Coord3D dstOrigin, Coord3D size) + : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin, + size) {} virtual void submit(device::VirtualDevice& device) { device.submitCopyMemoryP2P(*this); } diff --git a/rocclr/runtime/platform/commandqueue.cpp b/rocclr/runtime/platform/commandqueue.cpp index 95ed258551..e559d23ffb 100644 --- a/rocclr/runtime/platform/commandqueue.cpp +++ b/rocclr/runtime/platform/commandqueue.cpp @@ -35,7 +35,8 @@ bool HostQueue::terminate() { Command* marker = nullptr; // Send a finish if the queue is still accepting commands. 
- { ScopedLock sl(queueLock_); + { + ScopedLock sl(queueLock_); if (thread_.acceptingCommands_) { marker = new Marker(*this, false); if (marker != nullptr) { @@ -50,7 +51,8 @@ bool HostQueue::terminate() { } // Wake-up the command loop, so it can exit - { ScopedLock sl(queueLock_); + { + ScopedLock sl(queueLock_); thread_.acceptingCommands_ = false; queueLock_.notify(); } @@ -159,6 +161,9 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) { void HostQueue::append(Command& command) { // We retain the command here. It will be released when its status // changes to CL_COMPLETE + if ((command.getWaitBits() & 0x1) != 0) { + finish(); + } command.retain(); command.setStatus(CL_QUEUED); queue_.enqueue(&command); @@ -214,4 +219,4 @@ bool DeviceQueue::create() { return result; } -} // namespace amd { +} // namespace amd diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index 81a7bcb3af..e77ac4d8f2 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -191,7 +191,13 @@ release(bool, GPU_DUMP_CODE_OBJECT, false, \ "Enable dump code object") \ release(uint, GPU_MAX_USWC_ALLOC_SIZE, 2048, \ "Set a limit in Mb on the maximum USWC allocation size" \ - "-1 = No limit") + "-1 = No limit") \ +release(uint, AMD_SERIALIZE_KERNEL, 0, \ + "Serialize kernel enqueue, 0x1 = Wait for completion before enqueue, " \ + "0x2 = Wait for completion after enqueue, 0x3 = both") \ +release(uint, AMD_SERIALIZE_COPY, 0, \ + "Serialize copies, 0x1 = Wait for completion before enqueue, " \ + "0x2 = Wait for completion after enqueue, 0x3 = both") namespace amd {