Correct computation of scratch slot requirements.

Each SE must be assigned equal numbers of slots and slots
must be assigned in units of whole groups.

Change-Id: I8f3677237fa6f2e2d25e3e78210c5a7a0ad792f3
此提交包含在:
Sean Keely
2021-01-08 16:19:55 -06:00
父節點 9fe8ccc3ee
當前提交 7bc6aac5d2
共有 3 個檔案被更改,包括 7 行新增3 行删除
+1
查看文件
@@ -93,6 +93,7 @@ class ScratchCache {
size_t dispatch_size;
size_t size_per_thread;
uint32_t lanes_per_wave;
uint32_t waves_per_group;
ptrdiff_t queue_process_offset;
bool large;
bool retry;
+5 -2
查看文件
@@ -798,6 +798,8 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
pkt.dispatch.workgroup_size_z;
uint64_t waves_per_group =
(lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave;
scratch.waves_per_group = waves_per_group;
uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
pkt.dispatch.workgroup_size_x) *
((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
@@ -805,10 +807,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
pkt.dispatch.workgroup_size_z);
// Assign an equal number of groups to each engine, clipping to capacity limits
const uint32_t engines = queue->agent_->properties().NumShaderBanks;
groups = ((groups + engines - 1) / engines) * engines;
scratch.wanted_slots = groups * waves_per_group;
scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
scratch.wanted_slots =
Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks));
scratch.dispatch_size =
scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;
+1 -1
查看文件
@@ -1185,7 +1185,7 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
return;
}
scratch_pool_.free(base);
waves_per_cu--;
waves_per_cu = waves_per_cu - scratch.waves_per_group;
}
// Failed to allocate minimal scratch