SWDEV-480209 - Handle GraphExec object release

=> The GraphExec instance could be destroyed before an asynchronous launch
completed; destroy it only after all pending graph launches have finished.
=> Remove the deferred GraphExec destruction that ran at the next sync point
(hipStreamSynchronize, hipDeviceSynchronize, etc.).

Change-Id: I4df682aae5787fd6e5240a7be936ce50361345d0
Bu işleme şunda yer alıyor:
Anusha GodavarthySurya
2024-10-01 12:00:46 +00:00
işlemeyi yapan: Anusha Godavarthy Surya
ebeveyn 05d6f75830
işleme f9f995c6d0
7 değiştirilmiş dosya ile 46 ekleme ve 74 silme
-2
Dosyayı Görüntüle
@@ -274,8 +274,6 @@ void Device::SyncAllStreams( bool cpu_wait) {
}
// Release freed memory for all memory pools on the device
ReleaseFreedMemory();
// Release all graph exec objects destroyed by user.
ReleaseGraphExec(hip::getCurrentDevice()->deviceId());
}
// ================================================================================================
-2
Dosyayı Görüntüle
@@ -448,8 +448,6 @@ hipError_t hipEventSynchronize(hipEvent_t event) {
hipError_t status = e->synchronize();
// Release freed memory for all memory pools on the device
g_devices[e->deviceId()]->ReleaseFreedMemory();
// Release all graph exec objects destroyed by user.
ReleaseGraphExec(e->deviceId());
HIP_RETURN(status);
}
+1 -11
Dosyayı Görüntüle
@@ -27,8 +27,6 @@
#include "hip_mempool_impl.hpp"
namespace hip {
extern std::unordered_map<GraphExec*, std::pair<hip::Stream*, bool>> GraphExecStatus_;
extern amd::Monitor GraphExecStatusLock_;
std::vector<hip::Stream*> g_captureStreams;
// StreamCaptureGlobalList lock
@@ -1422,16 +1420,8 @@ hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
if (pGraphExec == nullptr) {
HIP_RETURN(hipErrorInvalidValue);
}
amd::ScopedLock lock(GraphExecStatusLock_);
hip::GraphExec* ge = reinterpret_cast<hip::GraphExec*>(pGraphExec);
// bool found = false;
if (GraphExecStatus_.find(ge) == GraphExecStatus_.end()) {
ge->release();
} else {
// graph execution is under progress. destroy graphExec during next sync point
auto pair = GraphExecStatus_[ge];
GraphExecStatus_[ge] = std::make_pair(pair.first, true);
}
ge->release();
HIP_RETURN(hipSuccess);
}
+37 -45
Dosyayı Görüntüle
@@ -51,10 +51,6 @@ const char* GetGraphNodeTypeString(uint32_t op) {
}
namespace hip {
std::unordered_map<GraphExec *, std::pair<hip::Stream *, bool>>
GraphExecStatus_ ROCCLR_INIT_PRIORITY(101);
// Guards graph execution state
amd::Monitor GraphExecStatusLock_ ROCCLR_INIT_PRIORITY(101){true};
int GraphNode::nextID = 0;
int Graph::nextID = 0;
@@ -601,6 +597,15 @@ void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* st
}
}
// ================================================================================================
// Callback with the (event, status, user_data) signature that drops one reference on the
// GraphExec passed through user_data. The event handle and the execution status are unused.
void GraphExec::DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data) {
  static_cast<GraphExec*>(user_data)->release();
}
// ================================================================================================
hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::Stream* hip_stream,
hip::GraphExec* graphExec) {
// Accumulate command tracks all the AQL packet batch that we submit to the HW. For now
@@ -893,52 +898,40 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
}
}
}
amd::ScopedLock lock(GraphExecStatusLock_);
GraphExecStatus_[this] = std::make_pair(launch_stream, false);
this->retain();
amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {});
// we may not need to flush any caches.
CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore);
amd::Event& event = CallbackCommand->event();
if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this)) {
return hipErrorInvalidHandle;
}
CallbackCommand->enqueue();
// Add the new barrier to stall the stream, until the callback is done
amd::Command::EventWaitList eventWaitList;
eventWaitList.push_back(CallbackCommand);
amd::Command* block_command = new amd::Marker(*launch_stream, kMarkerDisableFlush, eventWaitList);
// we may not need to flush any caches.
block_command->setEventScope(amd::Device::kCacheStateIgnore);
if (block_command == nullptr) {
return hipErrorInvalidValue;
}
block_command->enqueue();
block_command->release();
CallbackCommand->release();
ResetQueueIndex();
return status;
}
void ReleaseGraphExec(int deviceId) {
// Release all graph exec objects destroyed by user.
amd::ScopedLock lock(GraphExecStatusLock_);
for (auto itr = GraphExecStatus_.begin(); itr != GraphExecStatus_.end();) {
auto pair = itr->second;
if (pair.first->DeviceId() == deviceId) {
if (pair.second == true) {
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Release GraphExec");
(itr->first)->release();
}
GraphExecStatus_.erase(itr++);
} else {
itr++;
}
}
}
// Removes every tracked GraphExec entry that was launched on the given stream.
// Entries whose flag is set (the GraphExec was destroyed by the user) are released first.
void ReleaseGraphExec(hip::Stream* stream) {
  amd::ScopedLock guard(GraphExecStatusLock_);
  auto entry = GraphExecStatus_.begin();
  while (entry != GraphExecStatus_.end()) {
    hip::Stream* const launchStream = entry->second.first;
    if (launchStream != stream) {
      // Entry was launched on a different stream; leave it in the map.
      ++entry;
      continue;
    }
    const bool destroyedByUser = entry->second.second;
    if (destroyedByUser) {
      ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Release GraphExec");
      entry->first->release();
    }
    // erase() hands back the iterator following the removed element.
    entry = GraphExecStatus_.erase(entry);
  }
}
// ================================================================================================
bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size) {
bool bStatus = true;
assert(pool_size > 0);
address graph_kernarg_base;
auto device = g_devices[ihipGetDevice()]->devices()[0];
// Current device is stored as part of tls. Save current device to destroy kernelArgs from the
// callback thread.
device_ = device;
if (device->info().largeBar_) {
graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size));
device_kernarg_pool_ = true;
@@ -977,13 +970,12 @@ address GraphKernelArgManager::AllocKernArg(size_t size, size_t alignment) {
}
void GraphKernelArgManager::ReadBackOrFlush() {
if (device_kernarg_pool_) {
auto device = g_devices[ihipGetDevice()]->devices()[0];
auto kernArgImpl = device->settings().kernel_arg_impl_;
if (device_kernarg_pool_ && device_) {
auto kernArgImpl = device_->settings().kernel_arg_impl_;
if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
*device->info().hdpMemFlushCntl = 1u;
auto kSentinel = *reinterpret_cast<volatile int*>(device->info().hdpMemFlushCntl);
*device_->info().hdpMemFlushCntl = 1u;
auto kSentinel = *reinterpret_cast<volatile int*>(device_->info().hdpMemFlushCntl);
} else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback &&
kernarg_graph_.back().kernarg_pool_addr_ != 0) {
address dev_ptr =
+8 -5
Dosyayı Görüntüle
@@ -160,11 +160,12 @@ class GraphKernelArgManager : public amd::ReferenceCountedObject, public amd::Gr
GraphKernelArgManager() : amd::ReferenceCountedObject() {}
~GraphKernelArgManager() {
//! Release the kernel arg pools
auto device = g_devices[ihipGetDevice()]->devices()[0];
for (auto& element : kernarg_graph_) {
device->hostFree(element.kernarg_pool_addr_, element.kernarg_pool_size_);
if (device_ != nullptr) {
for (auto& element : kernarg_graph_) {
device_->hostFree(element.kernarg_pool_addr_, element.kernarg_pool_size_);
}
kernarg_graph_.clear();
}
kernarg_graph_.clear();
}
// Allocate kernel arg pool for the given size.
@@ -185,7 +186,8 @@ class GraphKernelArgManager : public amd::ReferenceCountedObject, public amd::Gr
size_t kernarg_pool_size_; //! Size of the pool
size_t kernarg_pool_offset_; //! Current offset in the kernel arg alloc
};
bool device_kernarg_pool_ = false; //! Indicate if kernel pool in device mem
bool device_kernarg_pool_ = false; //! Indicate if kernel pool in device mem
amd::Device* device_ = nullptr; //! Device from where kernel arguments are allocated
std::vector<KernelArgPoolGraph> kernarg_graph_; //! Vector of allocated kernarg pool
using KernelArgImpl = device::Settings::KernelArgImpl;
};
@@ -828,6 +830,7 @@ struct GraphExec : public amd::ReferenceCountedObject {
//! Returns the kernArgManager_ pointer stored in this object.
GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; }
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
};
struct ChildGraphNode : public GraphNode {
-2
Dosyayı Görüntüle
@@ -89,8 +89,6 @@ struct GraphNode;
struct GraphExec;
struct UserObject;
class Stream;
extern void ReleaseGraphExec(int deviceId);
extern void ReleaseGraphExec(hip::Stream* stream);
typedef struct ihipIpcMemHandle_st {
char ipc_handle[IHIP_IPC_MEM_HANDLE_SIZE]; ///< ipc memory handle on ROCr
size_t psize;
-7
Dosyayı Görüntüle
@@ -82,7 +82,6 @@ void Stream::Destroy(hip::Stream* stream) {
// ================================================================================================
// Calls HostQueue::terminate() first, then ReleaseGraphExec(this) for this stream.
// Always returns true.
bool Stream::terminate() {
HostQueue::terminate();
ReleaseGraphExec(this);
return true;
}
// ================================================================================================
@@ -368,12 +367,6 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
}
// Wait for the current host queue
hip_stream->finish(wait_for_cpu);
if (stream == nullptr) {
// null stream will sync with other streams.
ReleaseGraphExec(hip_stream->DeviceId());
} else {
ReleaseGraphExec(hip_stream);
}
// Release freed memory for all memory pools on the device
hip_stream->GetDevice()->ReleaseFreedMemory();
return hipSuccess;