SWDEV-508279 - Improve HIP event profiling

This patch makes two functional changes:
* Use GPU timing for internal markers for HIP.
* Measure the CPU time closer to the GPU timestamp, to reduce the delta between the GPU and CPU timestamp measurements.

There are some smaller non-functional updates:
* Fix typo: waifForFence -> waitForFence
* Remove the unused drmProfiling parameter from profilingBegin

Change-Id: I4c5fa600a842ab60e454888779edcac8449a902a


[ROCm/clr commit: 179801a750]
This commit is contained in:
Aidan Belton-Schure
2025-01-31 11:48:44 +00:00
committed by Aidan Belton-Schure
vanhempi 5725b99619
commit 4b4a35b86b
4 muutettua tiedostoa jossa 33 lisäystä ja 21 poistoa
+3 -1
Näytä tiedosto
@@ -224,7 +224,9 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
releaseFlags = amd::Device::kCacheStateInvalid;
}
// Always submit a EventMarker.
command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush);
constexpr bool kMarkerTs = true;
command =
new hip::EventMarker(*stream, !kMarkerDisableFlush, kMarkerTs, releaseFlags, batch_flush);
}
return hipSuccess;
}
@@ -433,7 +433,7 @@ bool VirtualGPU::Queue::flush() {
// Make sure the slot isn't busy
constexpr bool IbReuse = true;
if (GPU_FLUSH_ON_EXECUTION) {
waifForFence<!IbReuse>(cmdBufIdSlot_);
waitForFence<!IbReuse>(cmdBufIdSlot_);
}
// Reset the counter of commands
@@ -444,7 +444,7 @@ bool VirtualGPU::Queue::flush() {
if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
// Wait for the last one
waifForFence<!IbReuse>(cmdBufIdSlot_);
waitForFence<!IbReuse>(cmdBufIdSlot_);
cmdBufIdCurrent_ = 1;
cmbBufIdRetired_ = 0;
}
@@ -452,7 +452,7 @@ bool VirtualGPU::Queue::flush() {
// Wrap current slot
cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;
waifForFence<IbReuse>(cmdBufIdSlot_);
waitForFence<IbReuse>(cmdBufIdSlot_);
// Progress retired TS
if ((cmdBufIdCurrent_ > max_command_buffers_) &&
@@ -511,7 +511,7 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
uint slotId = id % max_command_buffers_;
constexpr bool IbReuse = true;
bool result = waifForFence<!IbReuse>(slotId);
bool result = waitForFence<!IbReuse>(slotId);
cmbBufIdRetired_ = id;
return result;
}
@@ -1170,7 +1170,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);
profilingBegin(vcmd, true);
profilingBegin(vcmd);
memory->syncCacheFromHost(*this);
cl_command_type type = vcmd.type();
@@ -1297,7 +1297,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) {
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);
profilingBegin(vcmd, true);
profilingBegin(vcmd);
bool entire = vcmd.isEntireMemory();
@@ -1613,7 +1613,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
profilingBegin(vcmd);
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
@@ -1708,7 +1708,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
LogError("Unmap without map call");
return;
}
profilingBegin(vcmd, true);
profilingBegin(vcmd);
// Check if image is a mipmap and assign a saved view
amdImage = owner->asImage();
@@ -1874,7 +1874,7 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(cmd, true);
profilingBegin(cmd);
if (cmd.type() == CL_COMMAND_FILL_IMAGE) {
if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
cmd.origin(), cmd.size())) {
@@ -2064,7 +2064,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
profilingBegin(vcmd);
// no op for FGS supported device
if (!dev().isFineGrainedSystem()) {
@@ -2103,7 +2103,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
profilingBegin(vcmd);
// no op for FGS supported device
if (!dev().isFineGrainedSystem()) {
@@ -2139,7 +2139,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
profilingBegin(vcmd);
if (!dev().isFineGrainedSystem()) {
size_t patternSize = vcmd.patternSize();
@@ -2171,7 +2171,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
profilingBegin(vcmd);
for (const auto& it : vcmd.memObjects()) {
// Find device memory
@@ -2855,6 +2855,17 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
if (!foundEvent) {
state_.forceWait_ = true;
}
} else if (amd::IS_HIP) {
// Use GPU based timing for HIP events
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
GpuEvent event;
profilingBegin(vcmd);
eventBegin(MainEngine);
eventEnd(MainEngine, event);
setGpuEvent(event);
profilingEnd(vcmd);
}
}
@@ -3361,6 +3372,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
amd::ScopedLock lock(execution());
earlyDone = waitAllEngines(cb);
}
// Get timestamp, in case readjustTimeGPU_ needs to be updated
uint64_t endTimeStampCPU = amd::Os::timeNanos();
// Free resource cache if we have too many entries
//! \note we do it here, when all engines are idle,
@@ -3384,7 +3397,6 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
// Get the timestamp value of the last command in the batch
cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
uint64_t endTimeStampCPU = amd::Os::timeNanos();
// Adjust the base time by the execution time
readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
}
@@ -3413,7 +3425,7 @@ bool VirtualGPU::allocConstantBuffers() {
return true;
}
void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
void VirtualGPU::profilingBegin(amd::Command& command) {
// Is profiling enabled?
if (command.profilingInfo().enabled_) {
// Allocate a timestamp object from the cache
@@ -137,7 +137,7 @@ class VirtualGPU : public device::VirtualDevice {
Pal::Result UpdateAppPowerProfile();
// ibReuse forces event wait without polling, to make sure event occurred
template <bool ibReuse> bool waifForFence(uint cbId) const {
template <bool ibReuse> bool waitForFence(uint cbId) const {
Pal::Result result = Pal::Result::Success;
uint64_t start;
uint64_t end;
@@ -394,9 +394,7 @@ class VirtualGPU : public device::VirtualDevice {
void addConstBuffer(ConstantBuffer* cb) { constBufs_.push_back(cb); }
//! Start the command profiling
void profilingBegin(amd::Command& command, //!< Command queue object
bool drmProfiling = false //!< Measure DRM time
);
void profilingBegin(amd::Command& command); //!< Command queue object
//! End the command profiling
void profilingEnd(amd::Command& command);
@@ -251,7 +251,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
// Submit to the device queue.
command->submit(*virtualDevice);
// if this is a user invisible marker command, then flush
// if this is a user invisible marker with a waiting event, then flush
if (0 == command->type()) {
virtualDevice->flush(head);
tail = head = NULL;