SWDEV-502365 - Track last used command

- This change tries to avoid the extra synchronization packets we may insert
  because we didn't track the completion signals for every command. We now
  track the currently enqueued command until it exits the enqueue stage. We
  also record the exit scope to know whether we flushed the caches.
- Handle correct release scopes and store completion signals as HW events
- Use a new finishCommand implementation to only wait for the command
  passed as the argument

Change-Id: Ie4350c5dd24f5d48dfa6ccbabd892f0544caadcc


[ROCm/clr commit: e03e4f3b5d]
Этот коммит содержится в:
Saleel Kudchadker
2024-12-03 23:45:31 +00:00
родитель 77840f1cb9
Коммит c8f39ec2b0
12 изменённых файлов: 148 добавлений и 92 удалений
+1 -1
Просмотреть файл
@@ -175,7 +175,7 @@ hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream)
command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList);
// Since we only need to have a dependency on an existing event,
// we may not need to flush any caches.
command->setEventScope(amd::Device::kCacheStateIgnore);
command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
if (command == NULL) {
return hipErrorOutOfMemory;
+1 -1
Просмотреть файл
@@ -87,7 +87,7 @@ class EventMarker : public amd::Marker {
profilingInfo_.marker_ts_ = markerTs;
profilingInfo_.batch_flush_ = batch_flush;
profilingInfo_.clear();
setEventScope(scope);
setCommandEntryScope(scope);
}
};
+1 -1
Просмотреть файл
@@ -739,7 +739,7 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
this->retain();
amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {});
// we may not need to flush any caches.
CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore);
CallbackCommand->setCommandEntryScope(amd::Device::kCacheStateIgnore);
amd::Event& event = CallbackCommand->event();
constexpr bool kBlocking = false;
if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this, kBlocking)) {
+9 -13
Просмотреть файл
@@ -664,13 +664,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
return hipSuccess;
} else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
((srcMemory != nullptr) && (dstMemory == nullptr))) {
// Don't wait for unpinned H2D copy if staging is used for copy. If dstMemory is not null, it
// can still be a pinned host memory, hence the check on dst memory type.
isHostAsync &=
((srcMemory == nullptr) && (dstMemory != nullptr && dstMemoryType == hipMemoryTypeDevice) &&
AMD_DIRECT_DISPATCH && (sizeBytes <= stream.device().settings().stagedXferSize_))
? true
: false;
// Unpinned copy wait behavior is enforced in the lower copy layers so skip
// wait at top level except for MT path
isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
} else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
// Device to Device copies do not need to host side synchronization.
if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
@@ -690,7 +686,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
}
command->enqueue();
if (!isHostAsync) {
command->queue()->finish();
command->queue()->finishCommand(command);
} else if (!isGPUAsync) {
hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
amd::Command::EventWaitList waitList;
@@ -1826,7 +1822,7 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset);
amd::Coord3D srcStart(srcRect.start_, 0, 0);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
if (dstMemory) {
amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
*stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1874,7 +1870,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, void* dstDevice, amd::C
amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset);
amd::Coord3D dstStart(dstRect.start_, 0, 0);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
if (srcMemory) {
amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
*stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1962,7 +1958,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, amd::Image* dstImage,
size_t start = ihipGetbufferStart(static_cast<size_t*>(srcOrigin),
static_cast<size_t*>(copyRegion), srcRowPitch, srcSlicePitch);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
if (srcMemory) {
amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
*stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{},
@@ -2012,7 +2008,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
size_t start = ihipGetbufferStart(static_cast<size_t*>(dstOrigin),
static_cast<size_t*>(copyRegion), dstRowPitch, dstSlicePitch);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
if (dstMemory) {
amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
*stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{},
@@ -2334,7 +2330,7 @@ inline hipError_t ihipMemcpyCmdEnqueue(amd::Command* command, bool isAsync = fal
}
command->enqueue();
if (!isAsync) {
command->queue()->finish();
command->queue()->finishCommand(command);
} else if (stream != nullptr) {
auto* newQueue = command->queue();
if (newQueue != stream) {
+2 -2
Просмотреть файл
@@ -472,9 +472,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
if (stopEvent != nullptr) {
hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent);
if (eStop->flags_ & hipEventDisableSystemFence) {
command->setEventScope(amd::Device::kCacheStateIgnore);
command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
} else {
command->setEventScope(amd::Device::kCacheStateSystem);
command->setCommandEntryScope(amd::Device::kCacheStateSystem);
}
// Enqueue Dispatch and bind the stop event
command->enqueue();
+19 -9
Просмотреть файл
@@ -309,7 +309,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
// Copy memory line by line
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx "
"HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, "
"completion_signal=0x%zx", dstMem.base, srcMem.base,
(wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
@@ -1841,11 +1841,17 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
}
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
"dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
// No cache flush is needed here as we use a staging buffer, and the acquire logic
// ensures that the cacheline is different and re-used only when L2 is flushed
bool kAttachSignal = false;
if (copyMetadata.isAsync_ == false) {
// If its a blocking call, attach signal to the packet which we can track for
// completion. Also flush caches as we may not need another packet to flush caches.
kAttachSignal = true;
gpu().addSystemScope();
}
result = shaderCopyBuffer(dstAddr, stagingBuffer,
origin, srcOrigin, copySize,
entire, dev().settings().limit_blit_wg_, copyMetadata);
entire, dev().settings().limit_blit_wg_,
copyMetadata, kAttachSignal);
if (!result) {
break;
}
@@ -2233,19 +2239,23 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
}
if (!result) {
// Flush caches for coherency as the MTYPE of the src buffer may be
// non-coherent which mean we need to read it again from memory.
// Also if its a device to device copy(intra device), we dont need flush
// Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_
auto memFlags = srcMemory.owner()->getMemFlags();
bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) {
if ((!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) ||
(!copyMetadata.isAsync_)) {
// Flush caches for coherency as the MTYPE of the src buffer is
// non-coherent(ie read it again from memory).
// For device to device copy(intra device), we dont need a flush.
// If the source is host memory and the copy is blocking(aka memory need
// to be coherent), then add system scope. For non blocking rely on the release
// scope issued by synchronization packet.
gpu().addSystemScope();
}
result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
reinterpret_cast<address>(srcMemory.virtualAddress()),
dstOrigin, srcOrigin, sizeIn,
entire, blitWg, copyMetadata);
entire, blitWg, copyMetadata, !copyMetadata.isAsync_);
}
synchronize();
+75 -53
Просмотреть файл
@@ -502,7 +502,19 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
prof_signal->flags_.done_ = false;
prof_signal->engine_ = engine_;
prof_signal->flags_.isPacketDispatch_ = false;
if (ts != 0) {
// Store the HW event
amd::Command* cmd = gpu_.command();
if (nullptr != cmd) {
// Release any existing HwEvent before setting new one for the same command
if (cmd->HwEvent() != nullptr) {
reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
}
cmd->SetHwEvent(prof_signal);
prof_signal->retain();
}
if (ts != nullptr) {
// Save HSA signal earlier to make sure the possible callback will have a valid
// value for processing
ts->retain();
@@ -533,13 +545,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
prof_signal->signal_.handle, prof_signal);
}
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
} else if (ts->command().profilingInfo().marker_ts_) {
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
}
}
}
@@ -1133,7 +1138,7 @@ inline bool VirtualGPU::dispatchAqlPacket(
dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false);
packet->header = packetHeader;
profilingEnd(*vcmd);
profilingEnd();
return true;
}
@@ -1379,6 +1384,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
// Initialize the last signal and dispatch flags
timestamp_ = nullptr;
command_ = nullptr;
hasPendingDispatch_ = false;
profiling_ = profiling;
cooperative_ = cooperative;
@@ -1631,6 +1637,9 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
* and then calls start() to get the current host timestamp.
*/
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
// Track the current command
command_ = &command;
// Disable profiling when command is being captured to prevent memory leak from created timestamp_
// which won't get freed, since the command is not being executed until graph launch
if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
@@ -1669,9 +1678,6 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
}
}
}
if (command.getPktCapturingState()) {
currCmd_ = &command;
}
}
// ================================================================================================
@@ -1679,8 +1685,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
* created for whatever command we are running and calls end() to get the
* current host timestamp if no signal is available.
*/
void VirtualGPU::profilingEnd(amd::Command& command) {
if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
void VirtualGPU::profilingEnd(bool clearHwEvent) {
if (!command_->getPktCapturingState() && command_->profilingInfo().enabled_) {
if (timestamp_->HwProfiling() == false) {
timestamp_->end();
}
@@ -1689,7 +1695,19 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
if (AMD_DIRECT_DISPATCH) {
assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
}
currCmd_ = nullptr;
// Certain commands like map/unmap memory may not need hw_events as its not a
// queue operation. In such cases clear already set events which may have been for sync
// before some memory map/unmap operation
if (clearHwEvent) {
if (command_->HwEvent() != nullptr) {
reinterpret_cast<ProfilingSignal*>(command_->HwEvent())->release();
command_->SetHwEvent(nullptr);
}
}
// Clear the command tracking
command_ = nullptr;
}
// ================================================================================================
@@ -1877,7 +1895,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
profilingEnd(cmd);
profilingEnd();
}
void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
@@ -1973,7 +1991,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
cmd.destination().signalWrite(&dev());
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -1995,7 +2013,7 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
(void**)(&(svmPointers[0])), cmd.userData());
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2018,9 +2036,12 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
hsa_status_t status = hsa_amd_svm_prefetch_async(
const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent,
wait_events.size(), wait_events.data(), active);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"HSA prefetch async dev_ptr=0x%zx, count=%d, wait_event=0x%zx, "
"completion_signal=0x%zx", const_cast<void*>(cmd.dev_ptr()), cmd.count(),
(wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
if ((status != HSA_STATUS_SUCCESS)) {
Barriers().ResetCurrentSignal();
LogError("hsa_amd_svm_prefetch_async failed");
cmd.setStatus(CL_INVALID_OPERATION);
@@ -2031,7 +2052,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
} else {
LogWarning("hsa_amd_svm_prefetch_async is ignored, because no HMM support");
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2145,7 +2166,7 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
cmd.OverrrideCommandType(copy_command_type_);
copy_command_type_ = 0;
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2227,7 +2248,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
// direct memcpy for FGS enabled system
amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2385,7 +2406,7 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
cmd.destination().signalWrite(&dstDevMem->dev());
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2423,7 +2444,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
}
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2463,7 +2484,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
memory->clearUnmapInfo(cmd.svmPtr());
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2567,7 +2588,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
}
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2659,7 +2680,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
devMemory->clearUnmapInfo(cmd.mapPtr());
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2735,18 +2756,15 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
bool force_blit = false;
if (amd::IS_HIP) {
constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR);
// In case of HMM, use blit kernel instead of CPU memcpy
if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) {
force_blit = true;
}
// Always use blit for memset for HIP.
force_blit = true;
}
if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
cmd.surface(), cmd.origin(), cmd.size(), force_blit)) {
cmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2825,7 +2843,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
} else {
ShouldNotReachHere();
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2838,8 +2856,9 @@ void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cm
if (!result) {
LogError("submitBatchMemoryOperation failed!");
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
@@ -2850,7 +2869,7 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
// Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
profilingEnd(vcmd);
profilingEnd();
return;
}
@@ -2906,7 +2925,10 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
}
}
profilingEnd(vcmd);
// Since this is a memory operation, the HW event set for barrier packet
// may not encapsulate what the command wants to do. Hence clear the hw_event
constexpr bool kClearHwEvent = true;
profilingEnd(kClearHwEvent);
}
// ================================================================================================
@@ -2945,7 +2967,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
}
profilingEnd(cmd);
profilingEnd();
}
// ================================================================================================
@@ -2976,7 +2998,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
}
}
profilingEnd(vcmd);
profilingEnd();
}
// ================================================================================================
@@ -3264,7 +3286,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
bool isGraphCapture = currCmd_ != nullptr && currCmd_->getPktCapturingState();
bool isGraphCapture = command_ != nullptr && command_->getPktCapturingState();
for (int j = 0; j < iteration; j++) {
// Reset global size for dimension dim if split is needed
if (dim != -1) {
@@ -3485,9 +3507,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
// Allocate buffer to hold kernel arguments
if (isGraphCapture) {
argBuffer = currCmd_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
argBuffer = command_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment());
currCmd_->SetKernelName(gpuKernel.name());
command_->SetKernelName(gpuKernel.name());
} else {
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "KernargSegmentByteSize = %lu "
"KernargSegmentAlignment = %lu", gpuKernel.KernargSegmentByteSize(),
@@ -3558,7 +3580,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
aqlHeaderWithOrder &= kAqlHeaderMask;
}
if (vcmd != nullptr && vcmd->getEventScope() == amd::Device::kCacheStateSystem) {
if (vcmd != nullptr && vcmd->getCommandEntryScope() == amd::Device::kCacheStateSystem) {
addSystemScope_ = true;
}
@@ -3576,8 +3598,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
// Dispatch the packet
if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
(sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
GPU_FLUSH_ON_EXECUTION, currCmd_->getPktCapturingState(),
currCmd_->getAqlPacket())) {
GPU_FLUSH_ON_EXECUTION, command_->getPktCapturingState(),
command_->getAqlPacket())) {
return false;
}
} else {
@@ -3676,7 +3698,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
hasPendingDispatch_ = true;
retainExternalSignals_ = true;
queue->profilingEnd(vcmd);
queue->profilingEnd();
} else {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
@@ -3690,7 +3712,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(vcmd);
profilingEnd();
}
}
@@ -3711,7 +3733,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
profilingBegin(vcmd);
if (timestamp_ != nullptr) {
const Settings& settings = dev().settings();
int32_t releaseFlags = vcmd.getEventScope();
int32_t releaseFlags = vcmd.getCommandEntryScope();
if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
@@ -3728,7 +3750,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
hasPendingDispatch_ = false;
}
}
profilingEnd(vcmd);
profilingEnd();
}
}
@@ -3747,7 +3769,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
dispatchBarrierPacket(kNopPacketHeader, false);
}
profilingEnd(vcmd);
profilingEnd();
}
// ================================================================================================
@@ -3757,7 +3779,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
profilingBegin(vcmd);
addSystemScope();
profilingEnd(vcmd);
profilingEnd();
}
// ================================================================================================
@@ -3765,7 +3787,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
profilingEnd(vcmd);
profilingEnd();
}
// ================================================================================================
+3 -3
Просмотреть файл
@@ -327,7 +327,7 @@ class VirtualGPU : public device::VirtualDevice {
const Device& dev() const { return roc_device_; }
void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
void profilingEnd(amd::Command& command);
void profilingEnd(bool clearHwEvent = false);
void updateCommandsState(amd::Command* list) const;
@@ -428,6 +428,7 @@ class VirtualGPU : public device::VirtualDevice {
HwQueueTracker& Barriers() { return barriers_; }
Timestamp* timestamp() const { return timestamp_; }
amd::Command* command() const { return command_; }
void* allocKernArg(size_t size, size_t alignment);
bool isFenceDirty() const { return fence_dirty_; }
@@ -528,6 +529,7 @@ class VirtualGPU : public device::VirtualDevice {
};
Timestamp* timestamp_;
amd::Command* command_; //!< Current command
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_barrier_and_packet_t barrier_packet_;
@@ -579,7 +581,5 @@ class VirtualGPU : public device::VirtualDevice {
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
using KernelArgImpl = device::Settings::KernelArgImpl;
amd::Command* currCmd_ = nullptr; //!< Current command under capture
};
}
+4 -4
Просмотреть файл
@@ -42,8 +42,8 @@ Event::Event(HostQueue& queue, bool profilingEnabled)
hw_event_(nullptr),
notify_event_(nullptr),
device_(&queue.device()),
profilingInfo_(profilingEnabled),
event_scope_(Device::kCacheStateInvalid) {
profilingInfo_(profilingEnabled) {
event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
notified_.clear();
}
@@ -53,8 +53,8 @@ Event::Event()
status_(CL_SUBMITTED),
hw_event_(nullptr),
notify_event_(nullptr),
device_(nullptr),
event_scope_(Device::kCacheStateInvalid) {
device_(nullptr) {
event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
notified_.clear();
}
+12 -5
Просмотреть файл
@@ -88,10 +88,13 @@ class Event : public RuntimeObject {
std::atomic<CallBackEntry*> callbacks_; //!< linked list of callback entries.
std::atomic<int32_t> status_; //!< current execution status.
std::atomic_flag notified_; //!< Command queue was notified
void* hw_event_; //!< HW event ID associated with SW event
Event* notify_event_; //!< Notify event, which should contain HW signal
const Device* device_; //!< Device, this event associated with
int32_t event_scope_; //!< 2 - system scope, 1 - device scope,
std::atomic<int32_t> event_entry_scope_; //!< Command entry scope
//!< 2 - system scope, 1 - device scope,
//!< 0 - ignore, -1 - invalid
protected:
@@ -219,11 +222,15 @@ class Event : public RuntimeObject {
//! Returns notify even associated with the current command
Event* NotifyEvent() const { return notify_event_; }
//! Get release scope of the event
int32_t getEventScope() const { return event_scope_; }
//! Returns the command entry scope stored for this event
//! (2 - system scope, 1 - device scope, 0 - ignore, -1 - invalid)
int32_t getCommandEntryScope() const {
  // Relaxed atomic read of the stored scope value
  const int32_t entryScope = event_entry_scope_.load(std::memory_order_relaxed);
  return entryScope;
}
//! Set release scope for the event
void setEventScope(int32_t scope) { event_scope_ = scope; }
//! Stores the command entry scope for this event
//! (2 - system scope, 1 - device scope, 0 - ignore, -1 - invalid)
void setCommandEntryScope(int32_t scope) {
  // Relaxed atomic write of the new scope value
  constexpr std::memory_order kOrder = std::memory_order_relaxed;
  event_entry_scope_.store(scope, kOrder);
}
};
union CopyMetadata {
+18
Просмотреть файл
@@ -133,6 +133,24 @@ bool HostQueue::terminate() {
return true;
}
//! Waits for a single command instead of draining the whole queue.
//! \param command  Command to wait for. When nullptr, the last queued command
//!                 (if there is one) is awaited on the host instead.
void HostQueue::finishCommand(Command* command) {
  if (command != nullptr) {
    // A specific command was given: query its HW event first and fall back to
    // a host-side wait only when the device reports the event as not ready
    constexpr bool kWaitCompletion = true;
    const bool hwEventReady = device().IsHwEventReady(command->event(), kWaitCompletion);
    if (!hwEventReady) {
      ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
      command->awaitCompletion();
    }
    return;
  }

  // No command was given: wait on whatever was queued last
  Command* lastQueued = getLastQueuedCommand(true);
  if (lastQueued == nullptr) {
    return;
  }
  ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
  lastQueued->awaitCompletion();
  // NOTE(review): presumably balances a retain taken by getLastQueuedCommand(true) - confirm
  lastQueued->release();
}
void HostQueue::finish(bool cpu_wait) {
Command* command = nullptr;
if (IS_HIP) {
+3
Просмотреть файл
@@ -233,6 +233,9 @@ class HostQueue : public CommandQueue {
//! Finish all queued commands
void finish(bool cpu_wait = false);
//! Wait until finish of one command
void finishCommand(Command* command);
//! Check if hostQueue empty snapshot
bool isEmpty();