SWDEV-538195 - Introduce threshold for handler submission (#723)
- When doing device/stream sync, we can submit a handler which may introduce some host side delays. Use DEBUG_CLR_BATCH_CPU_SYNC_SIZE to batch commands for host wait. Default for HIP is 8 commands. - Investigation is underway in ROCr but need to address this for now in HIP runtime.
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
cd46294b31
Коммит
9b045922a8
@@ -32,6 +32,7 @@ namespace hip {
|
||||
|
||||
// ================================================================================================
|
||||
hip::Stream* Device::NullStream(bool wait) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
|
||||
if (null_stream_ == nullptr) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
if (null_stream_ == nullptr) {
|
||||
@@ -188,6 +189,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
|
||||
|
||||
if (wait_null_stream) {
|
||||
if (null_stream_) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
|
||||
waitForStream(null_stream_);
|
||||
}
|
||||
} else {
|
||||
@@ -198,6 +200,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
|
||||
((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
|
||||
// and it's not the current stream
|
||||
(active_stream != blocking_stream)) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream);
|
||||
// Get the last valid command
|
||||
waitForStream(active_stream);
|
||||
}
|
||||
|
||||
@@ -674,7 +674,7 @@ hipError_t hipDeviceSynchronize() {
|
||||
CHECK_SUPPORTED_DURING_CAPTURE();
|
||||
constexpr bool kDoWaitForCpu = false;
|
||||
hip::getCurrentDevice()->SyncAllStreams(kDoWaitForCpu);
|
||||
HIP_RETURN(hipSuccess);
|
||||
HIP_RETURN_DURATION(hipSuccess);
|
||||
}
|
||||
|
||||
int ihipGetDevice() {
|
||||
|
||||
@@ -358,6 +358,8 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
}
|
||||
}
|
||||
|
||||
// The hsa copy api would result in a dirty cache state
|
||||
gpu().setFenceDirty(false);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -573,6 +575,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
gpu().addSystemScope();
|
||||
// The hsa copy api would result in a dirty cache state
|
||||
gpu().setFenceDirty(false);
|
||||
} else {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("HSA copy failed with code %d, falling to Blit copy", status);
|
||||
|
||||
@@ -136,7 +136,7 @@ void Timestamp::checkGpuTime() {
|
||||
amd::ScopedLock lock(it->LockSignalOps());
|
||||
// Ignore the wait if runtime processes API callback, because the signal value is bigger
|
||||
// than expected and the value reset will occur after API callback is done
|
||||
if (GetCallbackSignal().handle == 0) {
|
||||
if (GetCallbackSignal().handle == 0 || GetBlocking() == false) {
|
||||
WaitForSignal(it->signal_);
|
||||
}
|
||||
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
|
||||
@@ -1006,6 +1006,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
// Check for queue full and wait if needed.
|
||||
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
fence_dirty_ = true;
|
||||
|
||||
if (addSystemScope_) {
|
||||
header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
|
||||
@@ -1018,6 +1019,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
|
||||
// Reset fence_dirty_ flag if we submit a packet with system scopes
|
||||
if (expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
}
|
||||
|
||||
// Dirty optimization to save on consequent dispatch packets which have requested flushes
|
||||
if (fence_state_ == amd::Device::kCacheStateSystem
|
||||
&& expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
header = dispatchPacketHeader_;
|
||||
|
||||
@@ -449,6 +449,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
bool isFenceDirty() const { return fence_dirty_; }
|
||||
void setFenceDirty(bool state) { fence_dirty_ = state; }
|
||||
void HiddenHeapInit();
|
||||
|
||||
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
|
||||
|
||||
@@ -159,7 +159,11 @@ void HostQueue::finishCommand(Command* command) {
|
||||
|
||||
void HostQueue::finish(bool cpu_wait) {
|
||||
Command* command = nullptr;
|
||||
size_t minBatchSize = 0;
|
||||
|
||||
if (IS_HIP) {
|
||||
minBatchSize = DEBUG_CLR_BATCH_CPU_SYNC_SIZE;
|
||||
|
||||
command = getLastQueuedCommand(true);
|
||||
if (command == nullptr) {
|
||||
return;
|
||||
@@ -170,23 +174,33 @@ void HostQueue::finish(bool cpu_wait) {
|
||||
cpu_wait = true;
|
||||
}
|
||||
}
|
||||
|
||||
size_t batchSize = GetSubmissionBatchSize();
|
||||
ClPrint(LOG_DEBUG, LOG_CMD,
|
||||
"finish() called with batch size: %zu, cpu_wait: %d, "
|
||||
"fence dirty: %d",
|
||||
batchSize, cpu_wait, vdev()->isFenceDirty());
|
||||
|
||||
// Force marker if the batch wasn't sent for CPU update or fence is dirty
|
||||
if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) {
|
||||
if (nullptr != command) {
|
||||
command->release();
|
||||
}
|
||||
const Command::EventWaitList nullWaitList = {};
|
||||
// Send a finish to make sure we finished all commands
|
||||
command = new Marker(*this, false);
|
||||
command = new Marker(*this, false, nullWaitList, nullptr, batchSize < minBatchSize);
|
||||
if (command == NULL) {
|
||||
return;
|
||||
}
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to %p for finish", this);
|
||||
command->enqueue();
|
||||
}
|
||||
// Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
|
||||
static constexpr bool kWaitCompletion = true;
|
||||
if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion)) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "No HW event || cpu wait=%d, await command completion", cpu_wait);
|
||||
ClPrint(LOG_DEBUG, LOG_CMD,
|
||||
"No HW event or batch size is less than %zu, "
|
||||
"await command completion",
|
||||
minBatchSize);
|
||||
command->awaitCompletion();
|
||||
|
||||
if (IS_HIP) {
|
||||
|
||||
@@ -251,6 +251,9 @@ class HostQueue : public CommandQueue {
|
||||
//! Get the submitted batch
|
||||
Command* GetSubmissionBatch() const { return head_; }
|
||||
|
||||
//! Get the current batch size
|
||||
size_t GetSubmissionBatchSize() const { return size_; }
|
||||
|
||||
//! Insert a command into the linked list of submitted commands
|
||||
void FormSubmissionBatch(Command* command) {
|
||||
// Insert the command to the linked list.
|
||||
|
||||
@@ -270,7 +270,9 @@ release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false, \
|
||||
release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true, \
|
||||
"Set this to true, to avoid host side abort for GPU errors") \
|
||||
release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \
|
||||
"Force use of SPIRV instead of device specific code object.") \
|
||||
"Force use of SPIRV instead of device specific code object.") \
|
||||
release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8, \
|
||||
"Forces the minimum batch size for CPU sync") \
|
||||
|
||||
namespace amd {
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user