Correct computation of scratch slot requirements.
Each SE must be assigned equal numbers of slots and slots must be assigned in units of whole groups. Change-Id: I8f3677237fa6f2e2d25e3e78210c5a7a0ad792f3
此提交包含在:
@@ -93,6 +93,7 @@ class ScratchCache {
|
||||
size_t dispatch_size;
|
||||
size_t size_per_thread;
|
||||
uint32_t lanes_per_wave;
|
||||
uint32_t waves_per_group;
|
||||
ptrdiff_t queue_process_offset;
|
||||
bool large;
|
||||
bool retry;
|
||||
|
||||
@@ -798,6 +798,8 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
|
||||
pkt.dispatch.workgroup_size_z;
|
||||
uint64_t waves_per_group =
|
||||
(lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave;
|
||||
scratch.waves_per_group = waves_per_group;
|
||||
|
||||
uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
|
||||
pkt.dispatch.workgroup_size_x) *
|
||||
((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
|
||||
@@ -805,10 +807,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
|
||||
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
|
||||
pkt.dispatch.workgroup_size_z);
|
||||
|
||||
// Assign an equal number of groups to each engine, clipping to capacity limits
|
||||
const uint32_t engines = queue->agent_->properties().NumShaderBanks;
|
||||
groups = ((groups + engines - 1) / engines) * engines;
|
||||
scratch.wanted_slots = groups * waves_per_group;
|
||||
scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
|
||||
scratch.wanted_slots =
|
||||
Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks));
|
||||
scratch.dispatch_size =
|
||||
scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;
|
||||
|
||||
|
||||
@@ -1185,7 +1185,7 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
|
||||
return;
|
||||
}
|
||||
scratch_pool_.free(base);
|
||||
waves_per_cu--;
|
||||
waves_per_cu = waves_per_cu - scratch.waves_per_group;
|
||||
}
|
||||
|
||||
// Failed to allocate minimal scratch
|
||||
|
||||
新增問題並參考
封鎖使用者