diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index 68a68f4592..42851cc1fb 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -413,7 +413,8 @@ hipError_t GraphExec::Init() {
     }
     status = CreateStreams(parallelLists_.size() - 1 + min_num_streams);
   } else {
-    status = CreateStreams(clonedGraph_->max_streams_);
+    // create extra stream to avoid queue collision with the default execution stream
+    status = CreateStreams(clonedGraph_->max_streams_ + 1);
   }
   if (status != hipSuccess) {
     return status;
@@ -638,20 +639,38 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
 }
 
 // ================================================================================================
-void Graph::UpdateStreams(
-    hip::Stream* launch_stream,
-    const std::vector<hip::Stream*>& parallel_streams) {
-  // Allocate array for parallel streams, based on the graph scheduling + current stream
-  streams_.resize(parallel_streams.size() + 1);
-
-  // Current stream is the default in the assignment
-  streams_[0] = launch_stream;
-  // Assign the streams in the array of all streams
-  for (uint32_t i = 0; i < parallel_streams.size(); ++i) {
-    streams_[i + 1] = parallel_streams[i];
-  }
-}
+void Graph::UpdateStreams(hip::Stream* launch_stream,
+                          const std::vector<hip::Stream*>& parallel_streams) {
+  // Slot 0 is always the launch stream, the remaining slots come from the parallel streams.
+  // The array has as many entries as parallel_streams, so one parallel stream is left out:
+  // that lets a stream which shares the queue of the launch stream be skipped.
+  // An empty parallel_streams still needs the slot for the launch stream.
+  const size_t num_streams = parallel_streams.empty() ? 1 : parallel_streams.size();
+  const uint64_t launch_queue_id = launch_stream->getQueueID();
+  streams_.clear();
+  streams_.reserve(num_streams);
+  streams_.push_back(launch_stream);
+  // Prefer the streams that don't collide with the queue of the launch stream
+  for (hip::Stream* stream : parallel_streams) {
+    if (streams_.size() == num_streams) {
+      break;
+    }
+    if (stream->getQueueID() != launch_queue_id) {
+      streams_.push_back(stream);
+    }
+  }
+  // More than one stream may share the queue of the launch stream. Use the colliding streams
+  // for the leftover slots instead of reading past the end of parallel_streams
+  for (hip::Stream* stream : parallel_streams) {
+    if (streams_.size() == num_streams) {
+      break;
+    }
+    if (stream->getQueueID() == launch_queue_id) {
+      streams_.push_back(stream);
+    }
+  }
+}
 
 // ================================================================================================
 bool Graph::RunOneNode(Node node, bool wait) {
   if (node->launch_id_ == -1) {
diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp
index 76a732acd7..86ca4cbc29 100644
--- a/projects/clr/hipamd/src/hip_stream.cpp
+++ b/projects/clr/hipamd/src/hip_stream.cpp
@@ -31,7 +31,7 @@ namespace hip {
 Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream,
                const std::vector<uint32_t>& cuMask, hipStreamCaptureStatus captureStatus)
     : amd::HostQueue(*dev->asContext(), *dev->devices()[0], 0, amd::CommandQueue::RealTimeDisabled,
-                convertToQueuePriority(p), cuMask),
+                     convertToQueuePriority(p), cuMask),
       lock_("Stream Callback lock"),
       device_(dev),
       priority_(p),
@@ -40,10 +40,9 @@ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream,
       cuMask_(cuMask),
       captureStatus_(captureStatus),
       originStream_(false),
-      captureID_(0)
-      {
-        device_->AddStream(this);
-      }
+      captureID_(0) {
+  device_->AddStream(this);
+}
 
 // ================================================================================================
 hipError_t Stream::EndCapture() {
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 45cdd1befd..f5aa124cce 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1276,7 +1276,10 @@ class VirtualDevice : public amd::HeapObject {
 
   //! Return the physical device for this virtual device.
   const amd::Device& device() const { return device_(); }
 
+  //! Return the identifier of the queue associated with this virtual device.
+  virtual uint64_t getQueueID() = 0;
+
   virtual void submitReadMemory(amd::ReadMemoryCommand& cmd) = 0;
   virtual void submitWriteMemory(amd::WriteMemoryCommand& cmd) = 0;
   virtual void submitCopyMemory(amd::CopyMemoryCommand& cmd) = 0;
diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp
index 05258e961c..a6614df5d4 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.hpp
@@ -311,6 +311,7 @@ class VirtualGPU : public device::VirtualDevice {
              amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
   ~VirtualGPU();
 
+  uint64_t getQueueID() { return hwRing_; }
   void submitReadMemory(amd::ReadMemoryCommand& vcmd);
   void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
   void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index d015b321d2..d404ee57ba 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -433,6 +433,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
+  uint64_t getQueueID() { return gpu_queue_->id; }
 
   // } roc OpenCL integration
  private:
diff --git a/projects/clr/rocclr/platform/commandqueue.hpp b/projects/clr/rocclr/platform/commandqueue.hpp
index 1c2118fe17..aa70631c62 100644
--- a/projects/clr/rocclr/platform/commandqueue.hpp
+++ b/projects/clr/rocclr/platform/commandqueue.hpp
@@ -294,6 +294,11 @@ class HostQueue : public CommandQueue {
   //! Get queue status
   bool GetQueueStatus() { return isActive_; }
 
+  //! Get the queue ID reported by the virtual device of this host queue
+  uint64_t getQueueID() {
+    return thread_.vdev()->getQueueID();
+  }
+
  private:
   Command* head_;  //!< Head of the batch list
   Command* tail_;  //!< Tail of the batch list