diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index fb2d592f80..92bf3d32b5 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -175,7 +175,7 @@ hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream) command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList); // Since we only need to have a dependency on an existing event, // we may not need to flush any caches. - command->setEventScope(amd::Device::kCacheStateIgnore); + command->setCommandEntryScope(amd::Device::kCacheStateIgnore); if (command == NULL) { return hipErrorOutOfMemory; diff --git a/projects/clr/hipamd/src/hip_event.hpp b/projects/clr/hipamd/src/hip_event.hpp index 84bb300891..7740b5ea37 100644 --- a/projects/clr/hipamd/src/hip_event.hpp +++ b/projects/clr/hipamd/src/hip_event.hpp @@ -87,7 +87,7 @@ class EventMarker : public amd::Marker { profilingInfo_.marker_ts_ = markerTs; profilingInfo_.batch_flush_ = batch_flush; profilingInfo_.clear(); - setEventScope(scope); + setCommandEntryScope(scope); } }; diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 589430fc46..0ad1144b2f 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -739,7 +739,7 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) { this->retain(); amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {}); // we may not need to flush any caches. - CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore); + CallbackCommand->setCommandEntryScope(amd::Device::kCacheStateIgnore); amd::Event& event = CallbackCommand->event(); constexpr bool kBlocking = false; if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this, kBlocking)) { diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 2f07a5227f..c0db1a2548 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -664,13 +664,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin return hipSuccess; } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) || ((srcMemory != nullptr) && (dstMemory == nullptr))) { - // Don't wait for unpinned H2D copy if staging is used for copy. If dstMemory is not null, it - // can still be a pinned host memory, hence the check on dst memory type. - isHostAsync &= - ((srcMemory == nullptr) && (dstMemory != nullptr && dstMemoryType == hipMemoryTypeDevice) && - AMD_DIRECT_DISPATCH && (sizeBytes <= stream.device().settings().stagedXferSize_)) - ? true - : false; + // Unpinned copy wait behavior is enforced in the lower copy layers so skip + // wait at top level except for MT path + isHostAsync &= AMD_DIRECT_DISPATCH ? true : false; } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) { // Device to Device copies do not need to host side synchronization. if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) && @@ -690,7 +686,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin } command->enqueue(); if (!isHostAsync) { - command->queue()->finish(); + command->queue()->finishCommand(command); } else if (!isGPUAsync) { hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); amd::Command::EventWaitList waitList; @@ -1826,7 +1822,7 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* dstHost, amd::Coo amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset); amd::Coord3D srcStart(srcRect.start_, 0, 0); - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE); if (dstMemory) { amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, @@ -1874,7 +1870,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, void* dstDevice, amd::C amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset); amd::Coord3D dstStart(dstRect.start_, 0, 0); - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE); if (srcMemory) { amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, @@ -1962,7 +1958,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, amd::Image* dstImage, size_t start = ihipGetbufferStart(static_cast(srcOrigin), static_cast(copyRegion), srcRowPitch, srcSlicePitch); - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE); if (srcMemory) { amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( *stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{}, @@ -2012,7 +2008,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, void* dstHost, amd::Coo size_t start = ihipGetbufferStart(static_cast(dstOrigin), static_cast(copyRegion), dstRowPitch, dstSlicePitch); - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE); if (dstMemory) { amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( *stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{}, @@ -2334,7 +2330,7 @@ inline hipError_t ihipMemcpyCmdEnqueue(amd::Command* command, bool isAsync = fal } command->enqueue(); if (!isAsync) { - command->queue()->finish(); + command->queue()->finishCommand(command); } else if (stream != nullptr) { auto* newQueue = command->queue(); if (newQueue != stream) { diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index 5e6e136755..22913231e1 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -472,9 +472,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, if (stopEvent != nullptr) { hip::Event* eStop = reinterpret_cast(stopEvent); if (eStop->flags_ & hipEventDisableSystemFence) { - command->setEventScope(amd::Device::kCacheStateIgnore); + command->setCommandEntryScope(amd::Device::kCacheStateIgnore); } else { - command->setEventScope(amd::Device::kCacheStateSystem); + command->setCommandEntryScope(amd::Device::kCacheStateSystem); } // Enqueue Dispatch and bind the stop event command->enqueue(); diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index a150bf260e..5ec63c2495 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -309,7 +309,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx " + "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, " "completion_signal=0x%zx", dstMem.base, srcMem.base, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); @@ -1841,11 +1841,17 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo } ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, " "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize); - // No cache flush is needed here as we use a staging buffer, and the acquire logic - // ensures that the cacheline is different and re-used only when L2 is flushed + bool kAttachSignal = false; + if (copyMetadata.isAsync_ == false) { + // If its a blocking call, attach signal to the packet which we can track for + // completion. Also flush caches as we may not need another packet to flush caches. + kAttachSignal = true; + gpu().addSystemScope(); + } result = shaderCopyBuffer(dstAddr, stagingBuffer, origin, srcOrigin, copySize, - entire, dev().settings().limit_blit_wg_, copyMetadata); + entire, dev().settings().limit_blit_wg_, + copyMetadata, kAttachSignal); if (!result) { break; } @@ -2233,19 +2239,23 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds } if (!result) { - // Flush caches for coherency as the MTYPE of the src buffer may be - // non-coherent which mean we need to read it again from memory. - // Also if its a device to device copy(intra device), we dont need flush // Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_ auto memFlags = srcMemory.owner()->getMemFlags(); bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0; - if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) { + if ((!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) || + (!copyMetadata.isAsync_)) { + // Flush caches for coherency as the MTYPE of the src buffer is + // non-coherent(ie read it again from memory). + // For device to device copy(intra device), we dont need a flush. + // If the source is host memory and the copy is blocking(aka memory need + // to be coherent), then add system scope. For non blocking rely on the release + // scope issued by synchronization packet. gpu().addSystemScope(); } result = shaderCopyBuffer(reinterpret_cast
(dstMemory.virtualAddress()), reinterpret_cast
(srcMemory.virtualAddress()), dstOrigin, srcOrigin, sizeIn, - entire, blitWg, copyMetadata); + entire, blitWg, copyMetadata, !copyMetadata.isAsync_); } synchronize(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 6e50b09e9d..7f43aaef35 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -502,7 +502,19 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->flags_.done_ = false; prof_signal->engine_ = engine_; prof_signal->flags_.isPacketDispatch_ = false; - if (ts != 0) { + + // Store the HW event + amd::Command* cmd = gpu_.command(); + if (nullptr != cmd) { + // Release any existing HwEvent before setting new one for the same command + if (cmd->HwEvent() != nullptr) { + reinterpret_cast(cmd->HwEvent())->release(); + } + cmd->SetHwEvent(prof_signal); + prof_signal->retain(); + } + + if (ts != nullptr) { // Save HSA signal earlier to make sure the possible callback will have a valid // value for processing ts->retain(); @@ -533,13 +545,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)", prof_signal->signal_.handle, prof_signal); } - // Update the current command/marker with HW event - prof_signal->retain(); - ts->command().SetHwEvent(prof_signal); - } else if (ts->command().profilingInfo().marker_ts_) { - // Update the current command/marker with HW event - prof_signal->retain(); - ts->command().SetHwEvent(prof_signal); } } } @@ -1133,7 +1138,7 @@ inline bool VirtualGPU::dispatchAqlPacket( dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false); packet->header = packetHeader; - profilingEnd(*vcmd); + profilingEnd(); return true; } @@ -1379,6 +1384,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, // Initialize the last signal and dispatch flags timestamp_ = nullptr; + command_ = nullptr; hasPendingDispatch_ = false; profiling_ = profiling; cooperative_ = cooperative; @@ -1631,6 +1637,9 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) { * and then calls start() to get the current host timestamp. */ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { + // Track the current command + command_ = &command; + // Disable profiling when command is being captured to prevent memory leak from created timestamp_ // which won't get freed, since the command is not being executed until graph launch if (!command.getPktCapturingState() && command.profilingInfo().enabled_) { @@ -1669,9 +1678,6 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { } } } - if (command.getPktCapturingState()) { - currCmd_ = &command; - } } // ================================================================================================ @@ -1679,8 +1685,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { * created for whatever command we are running and calls end() to get the * current host timestamp if no signal is available. */ -void VirtualGPU::profilingEnd(amd::Command& command) { - if (!command.getPktCapturingState() && command.profilingInfo().enabled_) { +void VirtualGPU::profilingEnd(bool clearHwEvent) { + if (!command_->getPktCapturingState() && command_->profilingInfo().enabled_) { if (timestamp_->HwProfiling() == false) { timestamp_->end(); } @@ -1689,7 +1695,19 @@ void VirtualGPU::profilingEnd(amd::Command& command) { if (AMD_DIRECT_DISPATCH) { assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty()); } - currCmd_ = nullptr; + + // Certain commands like map/unmap memory may not need hw_events as its not a + // queue operation. In such cases clear already set events which may have been for sync + // before some memory map/unmap operation + if (clearHwEvent) { + if (command_->HwEvent() != nullptr) { + reinterpret_cast(command_->HwEvent())->release(); + command_->SetHwEvent(nullptr); + } + } + + // Clear the command tracking + command_ = nullptr; } // ================================================================================================ @@ -1877,7 +1895,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) { cmd.setStatus(CL_OUT_OF_RESOURCES); } - profilingEnd(cmd); + profilingEnd(); } void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) { @@ -1973,7 +1991,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) { cmd.destination().signalWrite(&dev()); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -1995,7 +2013,7 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) { cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(), (void**)(&(svmPointers[0])), cmd.userData()); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2018,9 +2036,12 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { hsa_status_t status = hsa_amd_svm_prefetch_async( const_cast(cmd.dev_ptr()), cmd.count(), agent, wait_events.size(), wait_events.data(), active); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, + "HSA prefetch async dev_ptr=0x%zx, count=%d, wait_event=0x%zx, " + "completion_signal=0x%zx", const_cast(cmd.dev_ptr()), cmd.count(), + (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); - // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution - if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) { + if ((status != HSA_STATUS_SUCCESS)) { Barriers().ResetCurrentSignal(); LogError("hsa_amd_svm_prefetch_async failed"); cmd.setStatus(CL_INVALID_OPERATION); @@ -2031,7 +2052,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { } else { LogWarning("hsa_amd_svm_prefetch_async is ignored, because no HMM support"); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2145,7 +2166,7 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { cmd.OverrrideCommandType(copy_command_type_); copy_command_type_ = 0; } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2227,7 +2248,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { // direct memcpy for FGS enabled system amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2385,7 +2406,7 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { cmd.destination().signalWrite(&dstDevMem->dev()); - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2423,7 +2444,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) { } } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2463,7 +2484,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) { memory->clearUnmapInfo(cmd.svmPtr()); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2567,7 +2588,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { } } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2659,7 +2680,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { devMemory->clearUnmapInfo(cmd.mapPtr()); - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2735,18 +2756,15 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { bool force_blit = false; if (amd::IS_HIP) { - constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR); - // In case of HMM, use blit kernel instead of CPU memcpy - if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) { - force_blit = true; - } + // Always use blit for memset for HIP. + force_blit = true; } if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.surface(), cmd.origin(), cmd.size(), force_blit)) { cmd.setStatus(CL_INVALID_OPERATION); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2825,7 +2843,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { } else { ShouldNotReachHere(); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2838,8 +2856,9 @@ void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cm if (!result) { LogError("submitBatchMemoryOperation failed!"); } - profilingEnd(cmd); + profilingEnd(); } + // ================================================================================================ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources @@ -2850,7 +2869,7 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr. amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - profilingEnd(vcmd); + profilingEnd(); return; } @@ -2906,7 +2925,10 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { } } - profilingEnd(vcmd); + // Since this is a memory operation, the HW event set for barrier packet + // may not encapsulate what the command wants to do. Hence clear the hw_event + constexpr bool kClearHwEvent = true; + profilingEnd(kClearHwEvent); } // ================================================================================================ @@ -2945,7 +2967,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); } - profilingEnd(cmd); + profilingEnd(); } // ================================================================================================ @@ -2976,7 +2998,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { } } - profilingEnd(vcmd); + profilingEnd(); } // ================================================================================================ @@ -3264,7 +3286,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, amd::Memory* const* memories = reinterpret_cast(parameters + kernelParams.memoryObjOffset()); - bool isGraphCapture = currCmd_ != nullptr && currCmd_->getPktCapturingState(); + bool isGraphCapture = command_ != nullptr && command_->getPktCapturingState(); for (int j = 0; j < iteration; j++) { // Reset global size for dimension dim if split is needed if (dim != -1) { @@ -3485,9 +3507,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) { // Allocate buffer to hold kernel arguments if (isGraphCapture) { - argBuffer = currCmd_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(), + argBuffer = command_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(), gpuKernel.KernargSegmentAlignment()); - currCmd_->SetKernelName(gpuKernel.name()); + command_->SetKernelName(gpuKernel.name()); } else { ClPrint(amd::LOG_INFO, amd::LOG_KERN, "KernargSegmentByteSize = %lu " "KernargSegmentAlignment = %lu", gpuKernel.KernargSegmentByteSize(), @@ -3558,7 +3580,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, aqlHeaderWithOrder &= kAqlHeaderMask; } - if (vcmd != nullptr && vcmd->getEventScope() == amd::Device::kCacheStateSystem) { + if (vcmd != nullptr && vcmd->getCommandEntryScope() == amd::Device::kCacheStateSystem) { addSystemScope_ = true; } @@ -3576,8 +3598,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, // Dispatch the packet if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder, (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS), - GPU_FLUSH_ON_EXECUTION, currCmd_->getPktCapturingState(), - currCmd_->getAqlPacket())) { + GPU_FLUSH_ON_EXECUTION, command_->getPktCapturingState(), + command_->getAqlPacket())) { return false; } } else { @@ -3676,7 +3698,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { hasPendingDispatch_ = true; retainExternalSignals_ = true; - queue->profilingEnd(vcmd); + queue->profilingEnd(); } else { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); @@ -3690,7 +3712,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { vcmd.setStatus(CL_INVALID_OPERATION); } - profilingEnd(vcmd); + profilingEnd(); } } @@ -3711,7 +3733,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { profilingBegin(vcmd); if (timestamp_ != nullptr) { const Settings& settings = dev().settings(); - int32_t releaseFlags = vcmd.getEventScope(); + int32_t releaseFlags = vcmd.getCommandEntryScope(); if (releaseFlags == Device::CacheState::kCacheStateIgnore) { if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) { dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); @@ -3728,7 +3750,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { hasPendingDispatch_ = false; } } - profilingEnd(vcmd); + profilingEnd(); } } @@ -3747,7 +3769,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { dispatchBarrierPacket(kNopPacketHeader, false); } - profilingEnd(vcmd); + profilingEnd(); } // ================================================================================================ @@ -3757,7 +3779,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) { profilingBegin(vcmd); addSystemScope(); - profilingEnd(vcmd); + profilingEnd(); } // ================================================================================================ @@ -3765,7 +3787,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); - profilingEnd(vcmd); + profilingEnd(); } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 76f9b8ecb7..c243f4cb71 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -327,7 +327,7 @@ class VirtualGPU : public device::VirtualDevice { const Device& dev() const { return roc_device_; } void profilingBegin(amd::Command& command, bool sdmaProfiling = false); - void profilingEnd(amd::Command& command); + void profilingEnd(bool clearHwEvent = false); void updateCommandsState(amd::Command* list) const; @@ -428,6 +428,7 @@ class VirtualGPU : public device::VirtualDevice { HwQueueTracker& Barriers() { return barriers_; } Timestamp* timestamp() const { return timestamp_; } + amd::Command* command() const { return command_; } void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } @@ -528,6 +529,7 @@ class VirtualGPU : public device::VirtualDevice { }; Timestamp* timestamp_; + amd::Command* command_; //!< Current command hsa_agent_t gpu_device_; //!< Physical device hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu hsa_barrier_and_packet_t barrier_packet_; @@ -579,7 +581,5 @@ class VirtualGPU : public device::VirtualDevice { std::atomic lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask using KernelArgImpl = device::Settings::KernelArgImpl; - - amd::Command* currCmd_ = nullptr; //!< Current command under capture }; } diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index c77bfa2ac0..ebf47b8eb4 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -42,8 +42,8 @@ Event::Event(HostQueue& queue, bool profilingEnabled) hw_event_(nullptr), notify_event_(nullptr), device_(&queue.device()), - profilingInfo_(profilingEnabled), - event_scope_(Device::kCacheStateInvalid) { + profilingInfo_(profilingEnabled) { + event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed); notified_.clear(); } @@ -53,8 +53,8 @@ Event::Event() status_(CL_SUBMITTED), hw_event_(nullptr), notify_event_(nullptr), - device_(nullptr), - event_scope_(Device::kCacheStateInvalid) { + device_(nullptr) { + event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed); notified_.clear(); } diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 6e5b5f6eda..68cad257f6 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -88,10 +88,13 @@ class Event : public RuntimeObject { std::atomic callbacks_; //!< linked list of callback entries. std::atomic status_; //!< current execution status. std::atomic_flag notified_; //!< Command queue was notified + void* hw_event_; //!< HW event ID associated with SW event Event* notify_event_; //!< Notify event, which should contain HW signal const Device* device_; //!< Device, this event associated with - int32_t event_scope_; //!< 2 - system scope, 1 - device scope, + + std::atomic event_entry_scope_; //!< Command entry scope + //!< 2 - system scope, 1 - device scope, //!< 0 - ignore, -1 - invalid protected: @@ -219,11 +222,15 @@ class Event : public RuntimeObject { //! Returns notify even associated with the current command Event* NotifyEvent() const { return notify_event_; } - //! Get release scope of the event - int32_t getEventScope() const { return event_scope_; } + //! Get entry scope of the event + int32_t getCommandEntryScope() const { + return event_entry_scope_.load(std::memory_order_relaxed); + } - //! Set release scope for the event - void setEventScope(int32_t scope) { event_scope_ = scope; } + //! Set entry scope for the event + void setCommandEntryScope(int32_t scope) { + event_entry_scope_.store(scope, std::memory_order_relaxed); + } }; union CopyMetadata { diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp index ce5d7ae65d..a6d7077358 100644 --- a/projects/clr/rocclr/platform/commandqueue.cpp +++ b/projects/clr/rocclr/platform/commandqueue.cpp @@ -133,6 +133,24 @@ bool HostQueue::terminate() { return true; } +void HostQueue::finishCommand(Command* command) { + if (command == nullptr) { + command = getLastQueuedCommand(true); + if (command != nullptr) { + ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host"); + command->awaitCompletion(); + command->release(); + } + return; + } + // Check hardware event status for the specific command + static constexpr bool kWaitCompletion = true; + if (!device().IsHwEventReady(command->event(), kWaitCompletion)) { + ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host"); + command->awaitCompletion(); + } +} + void HostQueue::finish(bool cpu_wait) { Command* command = nullptr; if (IS_HIP) { diff --git a/projects/clr/rocclr/platform/commandqueue.hpp b/projects/clr/rocclr/platform/commandqueue.hpp index f3e4f45c26..94c63fca53 100644 --- a/projects/clr/rocclr/platform/commandqueue.hpp +++ b/projects/clr/rocclr/platform/commandqueue.hpp @@ -233,6 +233,9 @@ class HostQueue : public CommandQueue { //! Finish all queued commands void finish(bool cpu_wait = false); + //! Wait until finish of one command + void finishCommand(Command* command); + //! Check if hostQueue empty snapshot bool isEmpty();