Refactor scratch handler function

Separate the event handler and scratch handler portions of the code into
separate functions.

Change-Id: Ifdb7461e816b0f2d3c1c0a74d6f020b4d6fc736c
This commit is contained in:
David Yat Sin
2023-11-15 17:24:09 +00:00
parent fa317f8c41
commit 64070a9acc
2 changed files with 140 additions and 115 deletions
+5 -1
View File
@@ -239,9 +239,13 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
/// @brief Halt the queue without destroying it or fencing memory.
void Suspend();
/// @brief Handle insufficient scratch
void HandleInsufficientScratch(hsa_signal_value_t& error_code, hsa_signal_value_t& waitVal,
bool& changeWait);
/// @brief Handler for hardware queue events.
template <bool HandleExceptions>
static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg);
static bool DynamicQueueEventsHandler(hsa_signal_value_t error_code, void* arg);
/// @brief Handler for KFD exceptions.
static bool ExceptionHandler(hsa_signal_value_t error_code, void* arg);
+135 -114
View File
@@ -292,7 +292,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
if (core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) {
if (AMD::hsa_amd_signal_async_handler(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE,
0, DynamicScratchHandler<false>,
0, DynamicQueueEventsHandler<false>,
this) != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
"Queue event handler failed registration.\n");
@@ -303,7 +303,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
"Queue event handler failed registration.\n");
} else {
if (AMD::hsa_amd_signal_async_handler(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE,
0, DynamicScratchHandler<true>,
0, DynamicQueueEventsHandler<true>,
this) != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
"Queue event handler failed registration.\n");
@@ -778,8 +778,135 @@ hsa_status_t AqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
return (err == HSAKMT_STATUS_SUCCESS ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR_OUT_OF_RESOURCES);
}
void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
hsa_signal_value_t& waitVal, bool& changeWait) {
// Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
auto& scratch = queue_scratch_;
/*******************************************************************************************
* uint32_t max_scratch_slots; // Maximum number of slots for this device based on num CUs
* uint64_t dispatch_slots; // Number of slots wanted for this dispatch
*
* uint64_t all_slots_size; // Size needed to fill all slots on this device
* uint64_t dispatch_size; // Size needed to fill wanted slots for this dispatch
*
*******************************************************************************************/
auto calc_dispatch_waves_per_group = [&](core::AqlPacket& pkt) {
const uint64_t lanes_per_group =
(uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
pkt.dispatch.workgroup_size_z;
const uint32_t lanes_per_wave = (error_code & 0x400) ? 32 : 64;
return (lanes_per_group + lanes_per_wave - 1) / lanes_per_wave;
};
auto calc_dispatch_groups = [&](core::AqlPacket& pkt) {
const uint64_t lanes_per_group =
(uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
pkt.dispatch.workgroup_size_z;
uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
pkt.dispatch.workgroup_size_x) *
((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
pkt.dispatch.workgroup_size_y) *
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
pkt.dispatch.workgroup_size_z);
const uint32_t cu_count = amd_queue_.max_cu_id + 1;
const uint32_t engines = agent_->properties().NumShaderBanks;
const uint32_t symmetric_cus = AlignDown(cu_count, engines);
const uint32_t asymmetryPerRound = cu_count - symmetric_cus;
const uint64_t rounds = groups / cu_count;
const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
const uint64_t symmetricGroups = groups - asymmetricGroups;
uint64_t maxGroupsPerEngine =
((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);
// For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each
// engine.
if (agent_->isa()->GetMajorVersion() >= 10 && maxGroupsPerEngine < 16 &&
lanes_per_group * maxGroupsPerEngine < 256) {
uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group;
maxGroupsPerEngine = Min(groups_per_interleave, 16ul);
}
// Populate all engines at max group occupancy, then clip down to device limits.
return maxGroupsPerEngine * engines;
};
// TODO: Move this to queue constructor since it does not depend on pkt, must be re-computed if
// CU Masking is enabled
auto calc_device_slots = [&]() {
// Get the hw maximum scratch slot count taking into consideration asymmetric harvest.
const uint32_t engines = agent_->properties().NumShaderBanks;
const uint32_t cu_count = amd_queue_.max_cu_id + 1;
return AlignUp(cu_count, engines) * agent_->properties().MaxSlotsScratchCU;
};
scratch.use_once_limit = core::Runtime::runtime_singleton_->flag().scratch_single_limit();
scratch.cooperative = (amd_queue_.hsa_queue.type == HSA_QUEUE_TYPE_COOPERATIVE);
uint64_t pkt_slot_idx = amd_queue_.read_dispatch_id & (amd_queue_.hsa_queue.size - 1);
core::AqlPacket& pkt = ((core::AqlPacket*)amd_queue_.hsa_queue.base_address)[pkt_slot_idx];
pkt.AssertIsDispatchAndNeedsScratch();
uint32_t device_slots = calc_device_slots();
uint32_t groups = calc_dispatch_groups(pkt);
uint32_t waves_per_group = calc_dispatch_waves_per_group(pkt);
uint32_t dispatch_slots = groups * waves_per_group;
dispatch_slots = std::min(dispatch_slots, device_slots);
const uint64_t lanes_per_wave = (error_code & 0x400) ? 32 : 64;
uint64_t device_size = pkt.dispatch.private_segment_size * lanes_per_wave * device_slots;
uint64_t dispatch_size = pkt.dispatch.private_segment_size * lanes_per_wave * dispatch_slots;
agent_->ReleaseQueueMainScratch(scratch);
scratch.main_size = device_size;
scratch.main_size_per_thread = pkt.dispatch.private_segment_size;
scratch.main_lanes_per_wave = lanes_per_wave;
scratch.main_waves_per_group = waves_per_group;
scratch.dispatch_size = dispatch_size;
scratch.dispatch_slots = dispatch_slots;
agent_->AcquireQueueMainScratch(scratch);
if (scratch.retry) {
dynamicScratchState |= ERROR_HANDLER_SCRATCH_RETRY;
changeWait = true;
waitVal = error_code;
} else if (scratch.main_queue_base == nullptr) {
// We could not allocate memory to fit even 1 wave
return;
}
// If we had to reduce number of waves
if (scratch.large) {
amd_queue_.queue_properties |= AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE;
// Set system release fence to flush scratch stores with older firmware versions.
if ((agent_->isa()->GetMajorVersion() == 8) && (agent_->GetMicrocodeVersion() < 729)) {
pkt.dispatch.header &= ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1)
<< HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
pkt.dispatch.header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
}
}
// Reset scratch memory related entities for the queue
InitScratchSRD();
// Restart the queue.
HSA::hsa_signal_store_screlease(amd_queue_.queue_inactive_signal, 0);
return;
}
template <bool HandleExceptions>
bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
bool AqlQueue::DynamicQueueEventsHandler(hsa_signal_value_t error_code, void* arg) {
AqlQueue* queue = (AqlQueue*)arg;
hsa_status_t errorCode = HSA_STATUS_SUCCESS;
bool fatal = false;
@@ -816,118 +943,12 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
// Process only one queue error.
if (error_code & 0x401) { // insufficient scratch, wave64 or wave32
// Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
auto& scratch = queue->queue_scratch_;
queue->HandleInsufficientScratch(error_code, waitVal, changeWait);
queue->agent_->ReleaseQueueMainScratch(scratch);
// Out of scratch - promote error
if (queue->queue_scratch_.main_queue_base == nullptr)
errorCode = HSA_STATUS_ERROR_OUT_OF_RESOURCES;
uint64_t pkt_slot_idx =
queue->amd_queue_.read_dispatch_id & (queue->amd_queue_.hsa_queue.size - 1);
core::AqlPacket& pkt =
((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx];
// Load the packet header as atomic acquire as it it written by another
// thread as atomic release. This ensures the rest of the packet fields
// are visible.
uint16_t pkt_header = atomic::Load(&pkt.packet.header, std::memory_order_acquire);
assert(core::AqlPacket::IsValid(pkt_header) && "Invalid packet in dynamic scratch handler.");
assert(core::AqlPacket::type(pkt_header) == HSA_PACKET_TYPE_KERNEL_DISPATCH &&
"Invalid packet in dynamic scratch handler.");
assert((pkt.dispatch.workgroup_size_x != 0) && (pkt.dispatch.workgroup_size_y != 0) &&
(pkt.dispatch.workgroup_size_z != 0) && "Invalid dispatch dimension.");
uint32_t scratch_request = pkt.dispatch.private_segment_size;
assert((scratch_request != 0) &&
"Scratch memory request from packet with no scratch demand. Possible bad kernel code object.");
// Get the hw maximum scratch slot count taking into consideration asymmetric harvest.
const uint32_t engines = queue->agent_->properties().NumShaderBanks;
const uint32_t cu_count = queue->amd_queue_.max_cu_id + 1;
const uint32_t MaxScratchSlots =
AlignUp(cu_count, engines) * queue->agent_->properties().MaxSlotsScratchCU;
scratch.main_size_per_thread = scratch_request;
scratch.main_lanes_per_wave = (error_code & 0x400) ? 32 : 64;
scratch.main_size_per_thread = AlignUp(
scratch.main_size_per_thread, scratch.mem_alignment_size / scratch.main_lanes_per_wave);
scratch.main_size =
scratch.main_size_per_thread * MaxScratchSlots * scratch.main_lanes_per_wave;
// Smaller dispatches may not need to reach full device occupancy.
// For these we need to ensure that the scratch we give doesn't restrict the dispatch even
// though it does not fill the device. Figure the total requested dispatch size.
uint64_t lanes_per_group =
(uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
pkt.dispatch.workgroup_size_z;
uint64_t waves_per_group =
(lanes_per_group + scratch.main_lanes_per_wave - 1) / scratch.main_lanes_per_wave;
scratch.main_waves_per_group = waves_per_group;
uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
pkt.dispatch.workgroup_size_x) *
((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
pkt.dispatch.workgroup_size_y) *
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
pkt.dispatch.workgroup_size_z);
// Find the maximum number of groups assigned to any engine.
const uint32_t symmetric_cus = AlignDown(cu_count, engines);
const uint32_t asymmetryPerRound = cu_count - symmetric_cus;
const uint64_t rounds = groups / cu_count;
const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
const uint64_t symmetricGroups = groups - asymmetricGroups;
uint64_t maxGroupsPerEngine =
((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);
// For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each
// engine.
if (queue->agent_->isa()->GetMajorVersion() >= 10 && maxGroupsPerEngine < 16 &&
lanes_per_group * maxGroupsPerEngine < 256) {
uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group;
maxGroupsPerEngine = Min(groups_per_interleave, 16ul);
}
// Populate all engines at max group occupancy, then clip down to device limits.
groups = maxGroupsPerEngine * engines;
scratch.dispatch_slots = groups * waves_per_group;
scratch.dispatch_slots = Min(scratch.dispatch_slots, uint64_t(MaxScratchSlots));
scratch.dispatch_size =
scratch.main_size_per_thread * scratch.dispatch_slots * scratch.main_lanes_per_wave;
scratch.cooperative = (queue->amd_queue_.hsa_queue.type == HSA_QUEUE_TYPE_COOPERATIVE);
queue->agent_->AcquireQueueMainScratch(scratch);
if (scratch.retry) {
queue->dynamicScratchState |= ERROR_HANDLER_SCRATCH_RETRY;
changeWait = true;
waitVal = error_code;
} else {
// Out of scratch - promote error
if (scratch.main_queue_base == nullptr) {
errorCode = HSA_STATUS_ERROR_OUT_OF_RESOURCES;
} else {
// Mark large scratch allocation for single use.
if (scratch.large) {
queue->amd_queue_.queue_properties |= AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE;
// Set system release fence to flush scratch stores with older firmware versions.
if ((queue->agent_->isa()->GetMajorVersion() == 8) &&
(queue->agent_->GetMicrocodeVersion() < 729)) {
pkt.dispatch.header &= ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1)
<< HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
pkt.dispatch.header |=
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
}
}
// Reset scratch memory related entities for the queue
queue->InitScratchSRD();
// Restart the queue.
HSA::hsa_signal_store_screlease(queue->amd_queue_.queue_inactive_signal, 0);
}
}
} else if (HandleExceptions) {
if ((error_code & 2) == 2) { // Invalid dim
@@ -973,7 +994,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
if (changeWait) {
core::Runtime::runtime_singleton_->SetAsyncSignalHandler(
queue->amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, waitVal,
DynamicScratchHandler<HandleExceptions>, queue);
DynamicQueueEventsHandler<HandleExceptions>, queue);
return false;
}
return true;