From 56dedad98b8d8f321fb76c45dcd39fbbe0e313b4 Mon Sep 17 00:00:00 2001 From: German Date: Wed, 22 Mar 2023 19:14:15 -0400 Subject: [PATCH] SWDEV-380703 - sync all streams individually Avoid syncing blocking streams with the default stream, since that introduces extra command dependencies and doesn't allow to destroy memory after last submission Change-Id: I618e9bd2091c4cf9157125612d8c4759030c5a80 [ROCm/clr commit: 1e88d2c52f9ee1d1beb5c2b103048cad84656201] --- projects/clr/hipamd/src/hip_device.cpp | 6 +-- .../clr/hipamd/src/hip_device_runtime.cpp | 18 +------- .../clr/hipamd/src/hip_graph_internal.cpp | 2 +- .../clr/hipamd/src/hip_graph_internal.hpp | 2 +- projects/clr/hipamd/src/hip_internal.hpp | 12 ++--- projects/clr/hipamd/src/hip_memory.cpp | 45 ++++++------------- projects/clr/hipamd/src/hip_stream.cpp | 35 ++++++++++----- 7 files changed, 49 insertions(+), 71 deletions(-) diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp index 8782b6c35e..d89ddbaaa0 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -26,8 +26,8 @@ namespace hip { // ================================================================================================ -hip::Stream* Device::NullStream(bool skip_alloc) { - if (null_stream_ == nullptr && !skip_alloc) { +hip::Stream* Device::NullStream() { + if (null_stream_ == nullptr) { null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); } @@ -138,7 +138,7 @@ Device::~Device() { } if (null_stream_!= nullptr) { - null_stream_->release(); + hip::Stream::Destroy(null_stream_); } } diff --git a/projects/clr/hipamd/src/hip_device_runtime.cpp b/projects/clr/hipamd/src/hip_device_runtime.cpp index 3fe2703e3f..fb6df662d7 100644 --- a/projects/clr/hipamd/src/hip_device_runtime.cpp +++ b/projects/clr/hipamd/src/hip_device_runtime.cpp @@ -513,23 +513,9 @@ hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) { HIP_RETURN(hipErrorNotSupported); } -hipError_t hipDeviceSynchronize ( void ) { +hipError_t hipDeviceSynchronize() { HIP_INIT_API(hipDeviceSynchronize); - - hip::Stream* stream = hip::getNullStream(); - - if (!stream) { - HIP_RETURN(hipErrorOutOfMemory); - } - - if (hip::Stream::StreamCaptureOngoing(reinterpret_cast(stream)) == true) { - HIP_RETURN(hipErrorStreamCaptureUnsupported); - } - - stream->finish(); - - hip::Stream::syncNonBlockingStreams(hip::getCurrentDevice()->deviceId()); - + hip::Stream::SyncAllStreams(hip::getCurrentDevice()->deviceId()); HIP_RETURN(hipSuccess); } diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 9b63a06f7e..7573b8b652 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -465,7 +465,7 @@ hipError_t hipGraphExec::CreateStreams(uint32_t num_streams) { hip::Stream::Priority::Normal, hipStreamNonBlocking); if (stream == nullptr || !stream->Create()) { if (stream != nullptr) { - stream->release(); + hip::Stream::Destroy(stream); } ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!\n"); return hipErrorOutOfMemory; diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index e5c03eb2e9..3aab90081c 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -578,7 +578,7 @@ struct hipGraphExec { // terminated after it complete execution for (auto stream : parallel_streams_) { if (stream != nullptr) { - stream->release(); + hip::Stream::Destroy(stream); } } for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second; diff --git a/projects/clr/hipamd/src/hip_internal.hpp b/projects/clr/hipamd/src/hip_internal.hpp index ead3fc8e96..b8028be29c 100644 --- a/projects/clr/hipamd/src/hip_internal.hpp +++ b/projects/clr/hipamd/src/hip_internal.hpp @@ -259,7 +259,7 @@ namespace hip { std::unordered_set captureEvents_; unsigned long long captureID_; - static inline CommandQueue::Priority convertToQueuePriority(Priority p){ + static inline CommandQueue::Priority convertToQueuePriority(Priority p) { return p == Priority::High ? amd::CommandQueue::Priority::High : p == Priority::Low ? amd::CommandQueue::Priority::Low : amd::CommandQueue::Priority::Normal; } @@ -271,7 +271,6 @@ namespace hip { /// Creates the hip stream object, including AMD host queue bool Create(); - virtual bool terminate() override; /// Get device ID associated with the current stream; int DeviceId() const; /// Get HIP device associated with the stream @@ -289,8 +288,8 @@ namespace hip { /// Returns the CU mask for the current stream const std::vector GetCUMask() const { return cuMask_; } - /// Sync all non-blocking streams - static void syncNonBlockingStreams(int deviceId); + /// Sync all streams + static void SyncAllStreams(int deviceId); /// Check whether any blocking stream running static bool StreamCaptureBlocking(); @@ -298,6 +297,8 @@ namespace hip { /// Destroy all streams on a given device static void destroyAllStreams(int deviceId); + static void Destroy(hip::Stream* stream); + /// Check Stream Capture status to make sure it is done static bool StreamCaptureOngoing(hipStream_t hStream); @@ -450,10 +451,9 @@ namespace hip { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - hip::Stream* NullStream(bool skip_alloc = false); + hip::Stream* NullStream(); Stream* GetNullStream(); - bool GetActiveStatus() { amd::ScopedLock lock(lock_); if (isActive_) return true; diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index edbe0ae367..c66d5044c2 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -75,14 +75,8 @@ hipError_t ihipFree(void *ptr) { if (memory_object != nullptr) { // Wait on the device, associated with the current memory object during allocation auto device_id = memory_object->getUserData().deviceId; - auto dev = g_devices[device_id]; - // Skip stream allocation, since if it wasn't allocated until free, then the device wasn't used - constexpr bool SkipStreamAlloc = true; - hip::Stream* stream = dev->NullStream(SkipStreamAlloc); - if (stream != nullptr) { - stream->finish(); - } - hip::Stream::syncNonBlockingStreams(device_id); + hip::Stream::SyncAllStreams(device_id); + // Find out if memory belongs to any memory pool if (!g_devices[device_id]->FreeMemory(memory_object, nullptr)) { // External mem is not svm. @@ -710,14 +704,11 @@ hipError_t ihipArrayDestroy(hipArray* array) { return hipErrorInvalidValue; } - for (auto& dev : g_devices) { - hip::Stream* stream = dev->NullStream(true); - if (stream != nullptr) { - stream->finish(); - } - } + auto image = as_amd(memObj); + // Wait on the device, associated with the current memory object during allocation + hip::Stream::SyncAllStreams(image->getUserData().deviceId); + image->release(); - as_amd(memObj)->release(); delete array; return hipSuccess; } @@ -984,7 +975,8 @@ amd::Image* ihipImageCreate(const cl_channel_order channelOrder, delete image; return nullptr; } - + // Save device ID image was creted on + image->getUserData().deviceId = hip::getCurrentDevice()->deviceId(); return image; } @@ -1204,12 +1196,7 @@ hipError_t ihipHostUnregister(void* hostPtr) { if (mem != nullptr) { // Wait on the device, associated with the current memory object during allocation - auto device_id = mem->getUserData().deviceId; - - hip::Stream* stream = g_devices[device_id]->NullStream(true); - if (stream != nullptr) { - stream->finish(); - } + hip::Stream::SyncAllStreams(mem->getUserData().deviceId); amd::MemObjMap::RemoveMemObj(hostPtr); for (const auto& device: g_devices) { @@ -3992,7 +3979,6 @@ hipError_t ihipMipmapArrayCreate(hipMipmappedArray_t* mipmapped_array_pptr, } hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { - if (mipmapped_array_ptr == nullptr) { return hipErrorInvalidValue; } @@ -4002,17 +3988,12 @@ hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { return hipErrorInvalidValue; } - for (auto& dev : g_devices) { - hip::Stream* stream = dev->NullStream(true); - if (stream != nullptr) { - stream->finish(); - } - } - - as_amd(mem_obj)->release(); + auto image = as_amd(mem_obj); + // Wait on the device, associated with the current memory object during allocation + hip::Stream::SyncAllStreams(image->getUserData().deviceId); + image->release(); delete mipmapped_array_ptr; - return hipSuccess; } diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp index c41896714a..7d107c16e7 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -75,12 +75,12 @@ bool Stream::Create() { } // ================================================================================================ -bool Stream::terminate() { +void Stream::Destroy(hip::Stream* stream) { { amd::ScopedLock lock(streamSetLock); - streamSet.erase(this); + streamSet.erase(stream); } - return HostQueue::terminate(); + stream->release(); } // ================================================================================================ @@ -107,6 +107,7 @@ int Stream::DeviceId() const { return device_->deviceId(); } +// ================================================================================================ int Stream::DeviceId(const hipStream_t hStream) { // Copying locally into non-const variable just to get const away hipStream_t inputStream = hStream; @@ -120,17 +121,27 @@ int Stream::DeviceId(const hipStream_t hStream) { return deviceId; } -void Stream::syncNonBlockingStreams(int deviceId) { - amd::ScopedLock lock(streamSetLock); - for (auto& it : streamSet) { - if (it->Flags() & hipStreamNonBlocking) { +// ================================================================================================ +void Stream::SyncAllStreams(int deviceId) { + // Make a local copy to avoid stalls for GPU finish with multiple threads + std::vector streams; + streams.reserve(streamSet.size()); + { + amd::ScopedLock lock(streamSetLock); + for (auto it : streamSet) { if (it->DeviceId() == deviceId) { - it->finish(); + streams.push_back(it); + it->retain(); } } } + for (auto it : streams) { + it->finish(); + it->release(); + } } +// ================================================================================================ bool Stream::StreamCaptureBlocking() { amd::ScopedLock lock(streamSetLock); for (auto& it : streamSet) { @@ -152,7 +163,7 @@ void Stream::destroyAllStreams(int deviceId) { } } for (auto& it : toBeDeleted) { - it->release(); + hip::Stream::Destroy(it); } } @@ -271,7 +282,7 @@ static hipError_t ihipStreamCreate(hipStream_t* stream, return hipErrorOutOfMemory; } else if (!hStream->Create()) { - hStream->release(); + hip::Stream::Destroy(hStream); return hipErrorOutOfMemory; } @@ -292,7 +303,7 @@ stream_per_thread::stream_per_thread() { stream_per_thread::~stream_per_thread() { for (auto &stream:m_streams) { if (stream != nullptr && hip::isValid(stream)) { - reinterpret_cast(stream)->release(); + hip::Stream::Destroy(reinterpret_cast(stream)); stream = nullptr; } } @@ -482,7 +493,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) { if (l_it != hip::tls.capture_streams_.end()) { hip::tls.capture_streams_.erase(l_it); } - s->release(); + hip::Stream::Destroy(s); HIP_RETURN(hipSuccess); }