diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 8429355360..88a33cbdfe 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -127,6 +127,9 @@ void Timestamp::checkGpuTime() { if (HwProfiling()) { uint64_t start = std::numeric_limits::max(); uint64_t end = 0; + uint64_t sdmaStart = std::numeric_limits::max(); + uint64_t sdmaEnd = 0; + for (auto it : signals_) { amd::ScopedLock lock(it->LockSignalOps()); @@ -140,41 +143,45 @@ void Timestamp::checkGpuTime() { if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ || command().type() == CL_COMMAND_TASK) { hsa_amd_profiling_dispatch_time_t time = {}; + hsa_amd_profiling_async_copy_time_t timeSdma = {}; amd_signal_t* amdSignal = reinterpret_cast(it->signal_.handle); - if (it->engine_ == HwQueueEngine::Compute) { - Hsa::profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time); + if (it->engine_ == HwQueueEngine::SdmaInter || it->engine_ == HwQueueEngine::SdmaRead || + it->engine_ == HwQueueEngine::SdmaWrite || it->engine_ == HwQueueEngine::SdmaIntra) { + Hsa::profiling_get_async_copy_time(it->signal_, &timeSdma); + sdmaStart = std::min(timeSdma.start, sdmaStart); + sdmaEnd = std::max(timeSdma.end, sdmaEnd); + // set dispatch time to be used in logging. + time.start = timeSdma.start; + time.end = timeSdma.end; } else { - hsa_amd_profiling_async_copy_time_t time_sdma = {}; - Hsa::profiling_get_async_copy_time(it->signal_, &time_sdma); - time.start = time_sdma.start; - time.end = time_sdma.end; + Hsa::profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time); + start = std::min(time.start, start); + end = std::max(time.end, end); } - start = std::min(time.start, start); - end = std::max(time.end, end); - if ((command().type() == CL_COMMAND_TASK) && (it->flags_.isPacketDispatch_ == true)) { static_cast(command()).addTimestamps(time.start, time.end); } ClPrint(amd::LOG_INFO, amd::LOG_TS, - "Signal = (0x%lx), Translated start/end = %ld / %ld, " - "Elapsed = %ld ns, ticks start/end = %ld / %ld, Ticks elapsed = %ld", + "Signal = (0x%lx), Translated start/end = %ld / %ld, Elapsed = %ld ns, " + "ticks start/end = %ld / %ld, Ticks elapsed = %ld, Engine = %u", it->signal_.handle, time.start, time.end, time.end - time.start, - amdSignal->start_ts, amdSignal->end_ts, amdSignal->end_ts - amdSignal->start_ts); + amdSignal->start_ts, amdSignal->end_ts, amdSignal->end_ts - amdSignal->start_ts, + it->engine_); } it->flags_.done_ = true; } signals_.clear(); - if (end != 0) { + if (end != 0 || sdmaEnd != 0) { // Check if it's the first execution and update start time if (!accum_ena_) { - start_ = start * ticksToTime_; + start_ = ((sdmaEnd != 0) ? sdmaStart : start) * ticksToTime_; accum_ena_ = true; } // Progress the end time always - end_ = end * ticksToTime_; + end_ = ((sdmaEnd != 0) ? sdmaEnd : end) * ticksToTime_; } } }