From f3aedfbec048cd0f9df035eaf88b40a728cb3c77 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Thu, 21 Mar 2024 18:26:01 +0000 Subject: [PATCH] SWDEV-301667 - Create TS for each node recorded in graph - Create a vector to allow multiple TS to be stored in Command. - This would mean we dont wait for entire batch in Accumulate command to finish when we exhaust signals. - Reduce the number of signals created at init to 64. This min value may still need to be tuned but the KFD allows max of 4094 interrupt signals per device. - Store kernel names whenever they are available and not just when profiling. If we dynamically enable profiling like for Torch, a crash can happen if hipGraphInstantiate wasnt included in Torch profile scope beacuse we previously entered kernel names only when profiler is attached. Change-Id: I34e7881a25bbc763f82fdeb3408a8ea58e1ec006 [ROCm/clr commit: c157bfb2022076959c521269b27f34996c1ee730] --- projects/clr/opencl/amdocl/cl_gl.cpp | 2 +- projects/clr/rocclr/device/pal/palvirtual.cpp | 15 ++-- .../clr/rocclr/device/rocm/rocvirtual.cpp | 88 +++++++++---------- .../clr/rocclr/device/rocm/rocvirtual.hpp | 4 +- projects/clr/rocclr/platform/command.cpp | 1 - projects/clr/rocclr/platform/command.hpp | 19 ++-- projects/clr/rocclr/platform/interop_gl.cpp | 3 +- projects/clr/rocclr/utils/flags.hpp | 2 +- 8 files changed, 63 insertions(+), 71 deletions(-) diff --git a/projects/clr/opencl/amdocl/cl_gl.cpp b/projects/clr/opencl/amdocl/cl_gl.cpp index a6a2851d11..7ccfb0c00b 100644 --- a/projects/clr/opencl/amdocl/cl_gl.cpp +++ b/projects/clr/opencl/amdocl/cl_gl.cpp @@ -769,7 +769,7 @@ RUNTIME_ENTRY_RET(cl_event, clCreateEventFromGLsyncKHR, // initially set the status of fence as queued clglEvent->setStatus(CL_SUBMITTED); // store GLsync id of the fence in event in order to associate them together - clglEvent->setData(clGLsync); + clglEvent->data().emplace_back(clGLsync); amd::Event* evt = clglEvent; evt->retain(); *not_null(errcode_ret) = CL_SUCCESS; diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index da61f82314..388b68a028 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -3317,7 +3317,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { return; } // Save the TimeStamp object in the current OCL event - command.setData(ts); + command.data().emplace_back(ts); profileTs_ = ts; state_.profileEnabled_ = true; } @@ -3325,7 +3325,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { void VirtualGPU::profilingEnd(amd::Command& command) { // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(command.data()); + TimeStamp* ts = !command.data().empty() ? reinterpret_cast(command.data().back()) + : nullptr; if (ts != nullptr) { // Check if the command actually did any GPU submission if (ts->isValid()) { @@ -3333,7 +3334,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) { } else { // Destroy the TimeStamp object tsCache_->freeTimeStamp(ts); - command.setData(nullptr); + command.data().clear(); } } } @@ -3362,7 +3363,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai first = cb->head_; while (nullptr != first) { // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); + TimeStamp* ts = !first->data().empty() ? reinterpret_cast(first->data().back()) + : nullptr; if (ts != nullptr) { ts->value(&startTimeStamp, &endTimeStamp); @@ -3379,7 +3381,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai first = cb->head_; while (nullptr != first) { // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); + TimeStamp* ts = !first->data().empty() ? reinterpret_cast(first->data().back()) + : nullptr; current = first->getNext(); @@ -3389,7 +3392,7 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai startTimeStamp -= readjustTimeGPU_; // Destroy the TimeStamp object tsCache_->freeTimeStamp(ts); - first->setData(nullptr); + first->data().clear(); } else { // For empty commands start/end is equal to // the end of the last valid command diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 1610aecfe1..54c6da78d2 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -194,23 +194,25 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { head = ts->command().GetBatchHead(); } while (head != nullptr) { - if (head->data() != nullptr) { - Timestamp* headTs = reinterpret_cast(head->data()); - ts->setParsedCommand(head); - for (auto it : headTs->Signals()) { - hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0; - if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) { - hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_, - HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, - &HsaAmdSignalHandler, ts); - if (HSA_STATUS_SUCCESS != result) { - LogError("hsa_amd_signal_async_handler() failed to requeue the handler!"); - } else { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p)," - "handle(0x%lx)", static_cast(val), headTs, - headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0); + if (!head->data().empty()) { + for (auto i = 0; i < head->data().size(); i++) { + Timestamp* headTs = reinterpret_cast(head->data()[i]); + ts->setParsedCommand(head); + for (auto it : headTs->Signals()) { + hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0; + if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) { + hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_, + HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, + &HsaAmdSignalHandler, ts); + if (HSA_STATUS_SUCCESS != result) { + LogError("hsa_amd_signal_async_handler() failed to requeue the handler!"); + } else { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p)," + "handle(0x%lx)", static_cast(val), headTs, + headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0); + } + return false; } - return false; } } } @@ -356,9 +358,7 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() { // ================================================================================================ bool VirtualGPU::HwQueueTracker::Create() { uint kSignalListSize = ROC_SIGNAL_POOL_SIZE; - if (activity_prof::IsEnabled(OP_ID_DISPATCH) || gpu_.profiling_) { - kSignalListSize = !flagIsDefault(ROC_SIGNAL_POOL_SIZE) ? ROC_SIGNAL_POOL_SIZE : 4 * Ki; - } + signal_list_.resize(kSignalListSize); hsa_agent_t agent = gpu_.gpu_device(); @@ -981,14 +981,14 @@ bool VirtualGPU::dispatchAqlPacket( inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) { amd::ScopedLock lock(execution()); if (vcmd != nullptr) { - profilingBegin(*vcmd, true, true); + profilingBegin(*vcmd, true); } dispatchBlockingWait(); constexpr size_t kPacketSize = 1; auto packet = reinterpret_cast(aqlpacket); dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); if (vcmd != nullptr) { - profilingEnd(*vcmd, true); + profilingEnd(*vcmd); } return true; } @@ -1455,23 +1455,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) { * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data * and then calls start() to get the current host timestamp. */ -void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) { +void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { if (command.profilingInfo().enabled_) { if (timestamp_ != nullptr) { LogWarning("Trying to create a second timestamp in VirtualGPU. \ This could have unintended consequences."); return; } - Timestamp* ts = useCommandTs ? reinterpret_cast(command.data()) : timestamp_; - if (ts == nullptr) { - // Without barrier profiling will wait for each individual signal - timestamp_ = new Timestamp(this, command); - command.setData(timestamp_); - timestamp_->start(); - } else { - timestamp_ = ts; - } + // Without barrier profiling will wait for each individual signal + timestamp_ = new Timestamp(this, command); + command.data().emplace_back(timestamp_); + timestamp_->start(); // Enable SDMA profiling on the first access if profiling is set // Its not per command basis @@ -1504,11 +1499,10 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool * created for whatever command we are running and calls end() to get the * current host timestamp if no signal is available. */ -void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) { +void VirtualGPU::profilingEnd(amd::Command& command) { if (command.profilingInfo().enabled_) { - Timestamp* ts = useCommandTs ? reinterpret_cast(command.data()) : timestamp_; - if (ts->HwProfiling() == false) { - ts->end(); + if (timestamp_->HwProfiling() == false) { + timestamp_->end(); } timestamp_ = nullptr; } @@ -1541,8 +1535,8 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const { // came before it to start and end with this first valid start time. current = list; while (current != nullptr) { - if (current->data() != nullptr) { - ts = reinterpret_cast(current->data()); + if (!current->data().empty()) { + ts = reinterpret_cast(current->data().back()); ts->getTime(&startTimeStamp, &endTimeStamp); break; } @@ -1564,13 +1558,15 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const { current = list; while (current != nullptr) { if (current->profilingInfo().enabled_) { - if (current->data() != nullptr) { - // Since this is a valid command to get a timestamp, we use the - // timestamp provided by the runtime (saved in the data()) - ts = reinterpret_cast(current->data()); - ts->getTime(&startTimeStamp, &endTimeStamp); - ts->release(); - current->setData(nullptr); + if (!current->data().empty()) { + for (auto i = 0; i < current->data().size(); i++) { + // Since this is a valid command to get a timestamp, we use the + // timestamp provided by the runtime (saved in the data()) + ts = reinterpret_cast(current->data()[i]); + ts->getTime(&startTimeStamp, &endTimeStamp); + ts->release(); + } + current->data().clear(); } else { // If we don't have a command that contains a valid timestamp, // we simply use the end timestamp of the previous command. @@ -3467,7 +3463,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true, true); + profilingBegin(vcmd, true); uint8_t* aqlPacket = vcmd.getLastPacket(); if (aqlPacket != nullptr) { @@ -3489,7 +3485,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { } } - profilingEnd(vcmd, true); + profilingEnd(vcmd); } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 25a5715642..c5f8fa4df2 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -306,8 +306,8 @@ class VirtualGPU : public device::VirtualDevice { bool create(); const Device& dev() const { return roc_device_; } - void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false); - void profilingEnd(amd::Command& command, bool useCommandTs = false); + void profilingBegin(amd::Command& command, bool sdmaProfiling = false); + void profilingEnd(amd::Command& command); void updateCommandsState(amd::Command* list) const; diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 7e078c9a68..aec37d179b 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -317,7 +317,6 @@ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& ev queue_(&queue), next_(nullptr), type_(type), - data_(nullptr), waitingEvent_(waitingEvent), eventWaitList_(eventWaitList), commandWaitBits_(commandWaitBits) { diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index cbd837a2d6..d60c95db66 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -258,7 +258,7 @@ class Command : public Event { Command* next_; //!< Next GPU command in the queue list Command* batch_head_ = nullptr; //!< The head of the batch commands cl_command_type type_; //!< This command's OpenCL type. - void* data_; + std::vector data_; const Event* waitingEvent_; //!< Waiting event associated with the marker protected: @@ -282,7 +282,6 @@ class Command : public Event { queue_(nullptr), next_(nullptr), type_(type), - data_(nullptr), waitingEvent_(nullptr), eventWaitList_(nullWaitList), commandWaitBits_(0) {} @@ -322,11 +321,9 @@ class Command : public Event { //! Return this command's OpenCL type. cl_command_type type() const { return type_; } - //! Return the opaque, device specific data for this command. - void* data() const { return data_; } + //! Return the opaque, device specific data vector for this command. + std::vector& data() { return data_; } - //! Set the opaque, device specific data for this command. - void setData(void* data) { data_ = data; } /*! \brief The execution engine for this command. * @@ -1273,17 +1270,13 @@ class AccumulateCommand : public Command { //! Add kernel name to the list if available void addKernelName(const std::string& kernelName) { - if (activity_prof::IsEnabled(OP_ID_DISPATCH)) { - // "^" is to indicate kernel is captured at instantiate - kernelNames_.push_back("^ " + kernelName); - } + // "^" is to indicate kernel is captured at instantiate + kernelNames_.push_back("^ " + kernelName); } //! Add kernel timestamp to the list if available void addTimestamps(uint64_t startTs, uint64_t endTs) { - if (activity_prof::IsEnabled(OP_ID_DISPATCH)) { - tsList_.push_back(std::make_pair(startTs, endTs)); - } + tsList_.push_back(std::make_pair(startTs, endTs)); } //! Return the kernel names diff --git a/projects/clr/rocclr/platform/interop_gl.cpp b/projects/clr/rocclr/platform/interop_gl.cpp index 342b034e69..20cf3a7d05 100644 --- a/projects/clr/rocclr/platform/interop_gl.cpp +++ b/projects/clr/rocclr/platform/interop_gl.cpp @@ -51,7 +51,8 @@ bool amd::ClGlEvent::waitForFence() { GLenum ret; // get fence id associated with fence event - GLsync gs = reinterpret_cast(command().data()); + GLsync gs = !command().data().empty() ? reinterpret_cast(command().data().back()) + : nullptr; if (!gs) return false; // Try to use DC and GLRC of current thread, if it doesn't exist diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 72f41dc693..0230eddb25 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -219,7 +219,7 @@ release(uint, ROC_P2P_SDMA_SIZE, 1024, \ "The minimum size in KB for P2P transfer with SDMA") \ release(uint, ROC_AQL_QUEUE_SIZE, 16384, \ "AQL queue size in AQL packets") \ -release(uint, ROC_SIGNAL_POOL_SIZE, 4096, \ +release(uint, ROC_SIGNAL_POOL_SIZE, 64, \ "Initial size of HSA signal pool") \ release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \ "Limit the number of workgroups in blit operations") \