diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 891397eed9..2315655934 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -451,13 +451,6 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
   hsa_signal_t signal;
 
-  assert(header != 0);
-  if (addSystemScope_) {
-    header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
-    header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
-    addSystemScope_ = false;
-  }
-
   // TODO: placeholder to setup the kernel to populate start and end timestamp.
   if (timestamp_ != nullptr) {
     // Find signal slot
@@ -2332,6 +2325,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
       aqlHeaderWithOrder &= kAqlHeaderMask;
     }
 
+    if (addSystemScope_) {
+      aqlHeaderWithOrder &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
+      aqlHeaderWithOrder |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
+      addSystemScope_ = false;
+    }
+
     // Dispatch the packet
     if (!dispatchAqlPacket(
       &dispatchPacket, aqlHeaderWithOrder,
@@ -2455,6 +2454,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
   amd::ScopedLock lock(execution());
 
   profilingBegin(vcmd);
+  addSystemScope();
   profilingEnd(vcmd);
 }
 