diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index a40d2f1e44..cd03ade47d 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -1909,9 +1909,13 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } bool needFlush = false; - dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); - if (dmaFlushMgmt().dispatchSplitSize() != 0) { - needFlush = true; + + // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd are in the same cmdBuffer + if (!state_.perfCounterEnabled_) { + dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); + if (dmaFlushMgmt().dispatchSplitSize() != 0) { + needFlush = true; + } } size_t newOffset[3] = {0, 0, 0}; @@ -2347,6 +2351,7 @@ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { if (vcmd.getState() == amd::PerfCounterCommand::Begin) { Pal::SetClockModeInput input; Pal::SetClockModeOutput output = {}; + state_.perfCounterEnabled_ = true; input.clockMode = Pal::DeviceClockMode::Profiling; dev().iDev()->SetClockMode(input, &output); GpuEvent event; @@ -2360,6 +2365,7 @@ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { iCmd()->CmdEndPerfExperiment(palPerf); eventEnd(MainEngine, event); setGpuEvent(event); + state_.perfCounterEnabled_ = false; } else { LogError("Unsupported performance counter state"); vcmd.setStatus(CL_INVALID_OPERATION); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index 65ee2f5ad1..4eabc1a902 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -180,9 +180,10 @@ class VirtualGPU : public device::VirtualDevice { //! The virtual GPU states union State { struct { - uint profiling_ : 1; //!< Profiling is enabled - uint forceWait_ : 1; //!< Forces wait in flush() - uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter + uint profiling_ : 1; //!< Profiling is enabled + uint forceWait_ : 1; //!< Forces wait in flush() + uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter + uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled }; uint value_; State() : value_(0) {}