SWDEV-508279 - Improve HIP event profiling

There are 2 functional changes to this patch: * Use GPU timing for internal markers for HIP. * Measure CPU time closer to GPU timer, to reduce delta between GPU/CPU timestamp measurements. There are some smaller non-functional updates: * waifForFence -> waitForFence typo * Remove unused drmProfiling Change-Id: I4c5fa600a842ab60e454888779edcac8449a902a [ROCm/clr commit: 179801a750]
2025-01-31 11:48:44 +00:00
commit 4b4a35b86b
@@ -224,7 +224,9 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
      releaseFlags = amd::Device::kCacheStateInvalid;
    }
    // Always submit a EventMarker.
-    command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush);
+    constexpr bool kMarkerTs = true;
+    command =
+        new hip::EventMarker(*stream, !kMarkerDisableFlush, kMarkerTs, releaseFlags, batch_flush);
  }
  return hipSuccess;
 }
@@ -433,7 +433,7 @@ bool VirtualGPU::Queue::flush() {
  // Make sure the slot isn't busy
  constexpr bool IbReuse = true;
  if (GPU_FLUSH_ON_EXECUTION) {
-    waifForFence<!IbReuse>(cmdBufIdSlot_);
+    waitForFence<!IbReuse>(cmdBufIdSlot_);
  }

  // Reset the counter of commands
@@ -444,7 +444,7 @@ bool VirtualGPU::Queue::flush() {

  if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
    // Wait for the last one
-    waifForFence<!IbReuse>(cmdBufIdSlot_);
+    waitForFence<!IbReuse>(cmdBufIdSlot_);
    cmdBufIdCurrent_ = 1;
    cmbBufIdRetired_ = 0;
  }
@@ -452,7 +452,7 @@ bool VirtualGPU::Queue::flush() {
  // Wrap current slot
  cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;

-  waifForFence<IbReuse>(cmdBufIdSlot_);
+  waitForFence<IbReuse>(cmdBufIdSlot_);

  // Progress retired TS
  if ((cmdBufIdCurrent_ > max_command_buffers_) &&
@@ -511,7 +511,7 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {

  uint slotId = id % max_command_buffers_;
  constexpr bool IbReuse = true;
-  bool result = waifForFence<!IbReuse>(slotId);
+  bool result = waitForFence<!IbReuse>(slotId);
  cmbBufIdRetired_ = id;
  return result;
 }
@@ -1170,7 +1170,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);

-  profilingBegin(vcmd, true);
+  profilingBegin(vcmd);

  memory->syncCacheFromHost(*this);
  cl_command_type type = vcmd.type();
@@ -1297,7 +1297,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) {
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);

-  profilingBegin(vcmd, true);
+  profilingBegin(vcmd);

  bool entire = vcmd.isEntireMemory();

@@ -1613,7 +1613,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  profilingBegin(vcmd, true);
+  profilingBegin(vcmd);

  pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());

@@ -1708,7 +1708,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
      LogError("Unmap without map call");
      return;
    }
-    profilingBegin(vcmd, true);
+    profilingBegin(vcmd);

    // Check if image is a mipmap and assign a saved view
    amdImage = owner->asImage();
@@ -1874,7 +1874,7 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  profilingBegin(cmd, true);
+  profilingBegin(cmd);
  if (cmd.type() == CL_COMMAND_FILL_IMAGE) {
    if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
        cmd.origin(), cmd.size())) {
@@ -2064,7 +2064,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  profilingBegin(vcmd, true);
+  profilingBegin(vcmd);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
@@ -2103,7 +2103,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
 void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
-  profilingBegin(vcmd, true);
+  profilingBegin(vcmd);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
@@ -2139,7 +2139,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  profilingBegin(vcmd, true);
+  profilingBegin(vcmd);

  if (!dev().isFineGrainedSystem()) {
    size_t patternSize = vcmd.patternSize();
@@ -2171,7 +2171,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  profilingBegin(vcmd, true);
+  profilingBegin(vcmd);

  for (const auto& it : vcmd.memObjects()) {
    // Find device memory
@@ -2855,6 +2855,17 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
    if (!foundEvent) {
      state_.forceWait_ = true;
    }
+  } else if (amd::IS_HIP) {
+    // Use GPU based timing for HIP events
+
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+    GpuEvent event;
+    profilingBegin(vcmd);
+    eventBegin(MainEngine);
+    eventEnd(MainEngine, event);
+    setGpuEvent(event);
+    profilingEnd(vcmd);
  }
 }

@@ -3361,6 +3372,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
    amd::ScopedLock lock(execution());
    earlyDone = waitAllEngines(cb);
  }
+  // Get timestamp, incase readjustTimeGPU_ needs to be updated
+  uint64_t endTimeStampCPU = amd::Os::timeNanos();

  // Free resource cache if we have too many entries
  //! \note we do it here, when all engines are idle,
@@ -3384,7 +3397,6 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
      // Get the timestamp value of the last command in the batch
      cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);

-      uint64_t endTimeStampCPU = amd::Os::timeNanos();
      // Adjust the base time by the execution time
      readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
    }
@@ -3413,7 +3425,7 @@ bool VirtualGPU::allocConstantBuffers() {
  return true;
 }

-void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
+void VirtualGPU::profilingBegin(amd::Command& command) {
  // Is profiling enabled?
  if (command.profilingInfo().enabled_) {
    // Allocate a timestamp object from the cache
@@ -137,7 +137,7 @@ class VirtualGPU : public device::VirtualDevice {
    Pal::Result UpdateAppPowerProfile();

    // ibReuse forces event wait without polling, to make sure event occured
-    template <bool ibReuse> bool waifForFence(uint cbId) const {
+    template <bool ibReuse> bool waitForFence(uint cbId) const {
      Pal::Result result = Pal::Result::Success;
      uint64_t start;
      uint64_t end;
@@ -394,9 +394,7 @@ class VirtualGPU : public device::VirtualDevice {
  void addConstBuffer(ConstantBuffer* cb) { constBufs_.push_back(cb); }

  //! Start the command profiling
-  void profilingBegin(amd::Command& command,     //!< Command queue object
-                      bool drmProfiling = false  //!< Measure DRM time
-  );
+  void profilingBegin(amd::Command& command);  //!< Command queue object

  //! End the command profiling
  void profilingEnd(amd::Command& command);
@@ -251,7 +251,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
    // Submit to the device queue.
    command->submit(*virtualDevice);

-    // if this is a user invisible marker command, then flush
+    // if this is a user invisible marker with a waiting event, then flush
    if (0 == command->type()) {
      virtualDevice->flush(head);
      tail = head = NULL;