SWDEV-538195 - Introduce threshold for handler submission (#723)

- When doing device/stream sync, we can submit a handler which may
  introduce some host-side delays. Use DEBUG_CLR_BATCH_CPU_SYNC_SIZE to
  batch commands for host wait. Default for HIP is 8 commands.
- Investigation is underway in ROCr, but we need to address this for now
  in the HIP runtime.
Этот коммит содержится в:
Kudchadker, Saleel
2025-08-06 20:34:42 -07:00
коммит произвёл GitHub
родитель cd46294b31
Коммит 9b045922a8
8 изменённых файлов: 40 добавлений и 6 удалений
+3
Просмотреть файл
@@ -32,6 +32,7 @@ namespace hip {
// ================================================================================================
hip::Stream* Device::NullStream(bool wait) {
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
if (null_stream_ == nullptr) {
amd::ScopedLock lock(lock_);
if (null_stream_ == nullptr) {
@@ -188,6 +189,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
if (wait_null_stream) {
if (null_stream_) {
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
waitForStream(null_stream_);
}
} else {
@@ -198,6 +200,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
// and it's not the current stream
(active_stream != blocking_stream)) {
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream);
// Get the last valid command
waitForStream(active_stream);
}
+1 -1
Просмотреть файл
@@ -674,7 +674,7 @@ hipError_t hipDeviceSynchronize() {
CHECK_SUPPORTED_DURING_CAPTURE();
constexpr bool kDoWaitForCpu = false;
hip::getCurrentDevice()->SyncAllStreams(kDoWaitForCpu);
HIP_RETURN(hipSuccess);
HIP_RETURN_DURATION(hipSuccess);
}
int ihipGetDevice() {
+4
Просмотреть файл
@@ -358,6 +358,8 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
}
}
// The hsa copy api would result in a dirty cache state
gpu().setFenceDirty(false);
return true;
}
@@ -573,6 +575,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
if (status == HSA_STATUS_SUCCESS) {
gpu().addSystemScope();
// The hsa copy api would result in a dirty cache state
gpu().setFenceDirty(false);
} else {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("HSA copy failed with code %d, falling to Blit copy", status);
+8 -1
Просмотреть файл
@@ -136,7 +136,7 @@ void Timestamp::checkGpuTime() {
amd::ScopedLock lock(it->LockSignalOps());
// Ignore the wait if runtime processes API callback, because the signal value is bigger
// than expected and the value reset will occur after API callback is done
if (GetCallbackSignal().handle == 0) {
if (GetCallbackSignal().handle == 0 || GetBlocking() == false) {
WaitForSignal(it->signal_);
}
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
@@ -1006,6 +1006,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Check for queue full and wait if needed.
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
fence_dirty_ = true;
if (addSystemScope_) {
header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
@@ -1018,6 +1019,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
// Reset fence_dirty_ flag if we submit a packet with system scopes
if (expected_fence_state == amd::Device::kCacheStateSystem) {
fence_dirty_ = false;
}
// Dirty optimization to save on consequent dispatch packets which have requested flushes
if (fence_state_ == amd::Device::kCacheStateSystem
&& expected_fence_state == amd::Device::kCacheStateSystem) {
header = dispatchPacketHeader_;
+1
Просмотреть файл
@@ -449,6 +449,7 @@ class VirtualGPU : public device::VirtualDevice {
void* allocKernArg(size_t size, size_t alignment);
//! Returns the current value of the fence_dirty_ flag
bool isFenceDirty() const { return fence_dirty_; }
//! Overwrites the fence_dirty_ flag with the given state
void setFenceDirty(bool state) { fence_dirty_ = state; }
void HiddenHeapInit();
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
+17 -3
Просмотреть файл
@@ -159,7 +159,11 @@ void HostQueue::finishCommand(Command* command) {
void HostQueue::finish(bool cpu_wait) {
Command* command = nullptr;
size_t minBatchSize = 0;
if (IS_HIP) {
minBatchSize = DEBUG_CLR_BATCH_CPU_SYNC_SIZE;
command = getLastQueuedCommand(true);
if (command == nullptr) {
return;
@@ -170,23 +174,33 @@ void HostQueue::finish(bool cpu_wait) {
cpu_wait = true;
}
}
size_t batchSize = GetSubmissionBatchSize();
ClPrint(LOG_DEBUG, LOG_CMD,
"finish() called with batch size: %zu, cpu_wait: %d, "
"fence dirty: %d",
batchSize, cpu_wait, vdev()->isFenceDirty());
// Force marker if the batch wasn't sent for CPU update or fence is dirty
if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) {
if (nullptr != command) {
command->release();
}
const Command::EventWaitList nullWaitList = {};
// Send a finish to make sure we finished all commands
command = new Marker(*this, false);
command = new Marker(*this, false, nullWaitList, nullptr, batchSize < minBatchSize);
if (command == NULL) {
return;
}
ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to %p for finish", this);
command->enqueue();
}
// Check HW status of the ROCclr event. Note: not all ROCclr modes support HW status
static constexpr bool kWaitCompletion = true;
if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion)) {
ClPrint(LOG_DEBUG, LOG_CMD, "No HW event || cpu wait=%d, await command completion", cpu_wait);
ClPrint(LOG_DEBUG, LOG_CMD,
"No HW event or batch size is less than %zu, "
"await command completion",
minBatchSize);
command->awaitCompletion();
if (IS_HIP) {
+3
Просмотреть файл
@@ -251,6 +251,9 @@ class HostQueue : public CommandQueue {
//! Get the submitted batch: returns head_, which is nullptr-checked by callers
//! to tell whether a batch is still pending
Command* GetSubmissionBatch() const { return head_; }
//! Get the current batch size (the value of size_); finish() compares it
//! against the minimum batch size when it creates its marker
size_t GetSubmissionBatchSize() const { return size_; }
//! Insert a command into the linked list of submitted commands
void FormSubmissionBatch(Command* command) {
// Insert the command to the linked list.
+3 -1
Просмотреть файл
@@ -270,7 +270,9 @@ release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false, \
release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true, \
"Set this to true, to avoid host side abort for GPU errors") \
release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \
"Force use of SPIRV instead of device specific code object.") \
"Force use of SPIRV instead of device specific code object.") \
release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8, \
"Forces the minimum batch size for CPU sync") \
namespace amd {