Correct scratch allocation logic to account for asymmetric harvest.

With asym. harvest hw does not issue groups equally to each SE,
occasionally hw will skip an SE so that the distribution reflects
each SE's CU count.  Scratch resources must be allocated to reflect
this asymmetric distribution of groups.

Change-Id: I65e26206500483ea18e6e8796e65ecba5354b029
このコミットが含まれているのは:
Sean Keely
2022-02-28 21:03:49 -06:00
コミット 552dcead93
2個のファイルの変更19行の追加6行の削除
+18 -4
ファイルの表示
@@ -831,8 +831,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
assert((scratch_request != 0) &&
"Scratch memory request from packet with no scratch demand. Possible bad kernel code object.");
// Get the hw maximum scratch slot count taking into consideration asymmetric harvest.
const uint32_t engines = queue->agent_->properties().NumShaderBanks;
const uint32_t cu_count = queue->amd_queue_.max_cu_id + 1;
const uint32_t MaxScratchSlots =
(queue->amd_queue_.max_cu_id + 1) * queue->agent_->properties().MaxSlotsScratchCU;
AlignUp(cu_count, engines) * queue->agent_->properties().MaxSlotsScratchCU;
scratch.size_per_thread = scratch_request;
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
@@ -840,6 +843,9 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
// Smaller dispatches may not need to reach full device occupancy.
// For these we need to ensure that the scratch we give doesn't restrict the dispatch even
// though it does not fill the device. Figure the total requested dispatch size.
uint64_t lanes_per_group =
(uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
pkt.dispatch.workgroup_size_z;
@@ -854,9 +860,17 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
pkt.dispatch.workgroup_size_z);
// Assign an equal number of groups to each engine, clipping to capacity limits
const uint32_t engines = queue->agent_->properties().NumShaderBanks;
groups = ((groups + engines - 1) / engines) * engines;
// Find the maximum number of groups assigned to any engine.
const uint32_t symmetric_cus = AlignDown(cu_count, engines);
const uint32_t asymmetryPerRound = cu_count - symmetric_cus;
const uint64_t rounds = groups / cu_count;
const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
const uint64_t symmetricGroups = groups - asymmetricGroups;
const uint64_t maxGroupsPerEngine =
((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);
// Populate all engines at max group occupancy, then clip down to device limits.
groups = maxGroupsPerEngine * engines;
scratch.wanted_slots = groups * waves_per_group;
scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
scratch.dispatch_size =
+1 -2
ファイルの表示
@@ -237,8 +237,7 @@ static __forceinline bool IsPowerOfTwo(T val) {
/// @return: T.
template <typename T>
static __forceinline T AlignDown(T value, size_t alignment) {
assert(IsPowerOfTwo(alignment));
return (T)(value & ~(alignment - 1));
return (T)((value / alignment) * alignment);
}
/// @brief: Same as previous one, but first parameter becomes pointer, for more