From 4b4a35b86bc9c203e7a4a7d396d86bd0f4366d09 Mon Sep 17 00:00:00 2001 From: Aidan Belton-Schure Date: Fri, 31 Jan 2025 11:48:44 +0000 Subject: [PATCH] SWDEV-508279 - Improve HIP event profiling There are 2 functional changes to this patch: * Use GPU timing for internal markers for HIP. * Measure CPU time closer to GPU timer, to reduce delta between GPU/CPU timestamp measurements. There are some smaller non-functional updates: * waifForFence -> waitForFence typo * Remove unused drmProfiling Change-Id: I4c5fa600a842ab60e454888779edcac8449a902a [ROCm/clr commit: 179801a75064ec1c01b19f8eafc7aebffbaf98cb] --- projects/clr/hipamd/src/hip_event.cpp | 4 +- projects/clr/rocclr/device/pal/palvirtual.cpp | 42 ++++++++++++------- projects/clr/rocclr/device/pal/palvirtual.hpp | 6 +-- projects/clr/rocclr/platform/commandqueue.cpp | 2 +- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 75892b098b..c04f470e6b 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -224,7 +224,9 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, releaseFlags = amd::Device::kCacheStateInvalid; } // Always submit a EventMarker. 
- command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush); + constexpr bool kMarkerTs = true; + command = + new hip::EventMarker(*stream, !kMarkerDisableFlush, kMarkerTs, releaseFlags, batch_flush); } return hipSuccess; } diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 505cd2f832..b1120f5da2 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -433,7 +433,7 @@ bool VirtualGPU::Queue::flush() { // Make sure the slot isn't busy constexpr bool IbReuse = true; if (GPU_FLUSH_ON_EXECUTION) { - waifForFence(cmdBufIdSlot_); + waitForFence(cmdBufIdSlot_); } // Reset the counter of commands @@ -444,7 +444,7 @@ bool VirtualGPU::Queue::flush() { if (cmdBufIdCurrent_ == GpuEvent::InvalidID) { // Wait for the last one - waifForFence(cmdBufIdSlot_); + waitForFence(cmdBufIdSlot_); cmdBufIdCurrent_ = 1; cmbBufIdRetired_ = 0; } @@ -452,7 +452,7 @@ bool VirtualGPU::Queue::flush() { // Wrap current slot cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_; - waifForFence(cmdBufIdSlot_); + waitForFence(cmdBufIdSlot_); // Progress retired TS if ((cmdBufIdCurrent_ > max_command_buffers_) && @@ -511,7 +511,7 @@ bool VirtualGPU::Queue::waitForEvent(uint id) { uint slotId = id % max_command_buffers_; constexpr bool IbReuse = true; - bool result = waifForFence(slotId); + bool result = waitForFence(slotId); cmbBufIdRetired_ = id; return result; } @@ -1170,7 +1170,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { // Find if virtual address is a CL allocation device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); - profilingBegin(vcmd, true); + profilingBegin(vcmd); memory->syncCacheFromHost(*this); cl_command_type type = vcmd.type(); @@ -1297,7 +1297,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) { // Find if virtual address is a CL allocation 
device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); - profilingBegin(vcmd, true); + profilingBegin(vcmd); bool entire = vcmd.isEntireMemory(); @@ -1613,7 +1613,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); + profilingBegin(vcmd); pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); @@ -1708,7 +1708,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { LogError("Unmap without map call"); return; } - profilingBegin(vcmd, true); + profilingBegin(vcmd); // Check if image is a mipmap and assign a saved view amdImage = owner->asImage(); @@ -1874,7 +1874,7 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - profilingBegin(cmd, true); + profilingBegin(cmd); if (cmd.type() == CL_COMMAND_FILL_IMAGE) { if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(), cmd.size())) { @@ -2064,7 +2064,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); + profilingBegin(vcmd); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { @@ -2103,7 +2103,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) { void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); + profilingBegin(vcmd); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { @@ -2139,7 +2139,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources 
 amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); + profilingBegin(vcmd); if (!dev().isFineGrainedSystem()) { size_t patternSize = vcmd.patternSize(); @@ -2171,7 +2171,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); + profilingBegin(vcmd); for (const auto& it : vcmd.memObjects()) { // Find device memory @@ -2855,6 +2855,17 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (!foundEvent) { state_.forceWait_ = true; } + } else if (amd::IS_HIP) { + // Use GPU based timing for HIP events + + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + GpuEvent event; + profilingBegin(vcmd); + eventBegin(MainEngine); + eventEnd(MainEngine, event); + setGpuEvent(event); + profilingEnd(vcmd); } } @@ -3361,6 +3372,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { amd::ScopedLock lock(execution()); earlyDone = waitAllEngines(cb); } + // Get timestamp, in case readjustTimeGPU_ needs to be updated + uint64_t endTimeStampCPU = amd::Os::timeNanos(); // Free resource cache if we have too many entries //! \note we do it here, when all engines are idle, @@ -3384,7 +3397,6 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { // Get the timestamp value of the last command in the batch cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); - uint64_t endTimeStampCPU = amd::Os::timeNanos(); // Adjust the base time by the execution time readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; } @@ -3413,7 +3425,7 @@ bool VirtualGPU::allocConstantBuffers() { return true; } -void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { +void VirtualGPU::profilingBegin(amd::Command& command) { // Is profiling enabled? 
if (command.profilingInfo().enabled_) { // Allocate a timestamp object from the cache diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index a6614df5d4..e1fb718b3f 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -137,7 +137,7 @@ class VirtualGPU : public device::VirtualDevice { Pal::Result UpdateAppPowerProfile(); // ibReuse forces event wait without polling, to make sure event occured - template bool waifForFence(uint cbId) const { + template bool waitForFence(uint cbId) const { Pal::Result result = Pal::Result::Success; uint64_t start; uint64_t end; @@ -394,9 +394,7 @@ class VirtualGPU : public device::VirtualDevice { void addConstBuffer(ConstantBuffer* cb) { constBufs_.push_back(cb); } //! Start the command profiling - void profilingBegin(amd::Command& command, //!< Command queue object - bool drmProfiling = false //!< Measure DRM time - ); + void profilingBegin(amd::Command& command); //!< Command queue object //! End the command profiling void profilingEnd(amd::Command& command); diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp index 9eaa1a6d2d..ce5d7ae65d 100644 --- a/projects/clr/rocclr/platform/commandqueue.cpp +++ b/projects/clr/rocclr/platform/commandqueue.cpp @@ -251,7 +251,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) { // Submit to the device queue. command->submit(*virtualDevice); - // if this is a user invisible marker command, then flush + // if this is a user invisible marker with a waiting event, then flush if (0 == command->type()) { virtualDevice->flush(head); tail = head = NULL;