Fix scratch memory alignment on GFX11

GFX11 requires scratch memory alignment of 256 Bytes instead of 1024.

Change-Id: I103de1c12f3a4877d7d36f13254301166c66e11f
Этот коммит содержится в:
David Yat Sin
2022-06-22 16:23:02 -04:00
родитель 90322899fe
Коммит c2a60a4d5d
2 изменённых файлов: 22 добавлений и 8 удалений
+1
Просмотреть файл
@@ -95,6 +95,7 @@ class ScratchCache {
uint32_t lanes_per_wave;
uint32_t waves_per_group;
uint64_t wanted_slots;
uint32_t mem_alignment_size;
bool cooperative;
ptrdiff_t queue_process_offset;
bool large;
+21 -8
Просмотреть файл
@@ -213,6 +213,11 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
assert(amd_queue_.private_segment_aperture_base_hi != 0 && "No private region found.");
}
if (agent_->isa()->GetMajorVersion() >= 11)
queue_scratch_.mem_alignment_size = 256;
else
queue_scratch_.mem_alignment_size = 1024;
MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() {
ScopedAcquire<KernelMutex> _lock(&queue_lock_);
queue_count_--;
@@ -839,8 +844,10 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
scratch.size_per_thread = scratch_request;
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
// Align whole waves to 1KB.
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size_per_thread =
AlignUp(scratch.size_per_thread, scratch.mem_alignment_size / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
// Smaller dispatches may not need to reach full device occupancy.
@@ -1389,11 +1396,13 @@ void AqlQueue::FillComputeTmpRingSize() {
// Scratch is allocated program COMPUTE_TMPRING_SIZE register
// Scratch Size per Wave is specified in terms of kilobytes
uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave *
queue_scratch_.size_per_thread) + 1023) / 1024);
uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave * queue_scratch_.size_per_thread) +
queue_scratch_.mem_alignment_size - 1) /
queue_scratch_.mem_alignment_size);
tmpring_size.bits.WAVESIZE = wave_scratch;
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024);
uint32_t num_waves =
queue_scratch_.size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size);
tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
amd_queue_.compute_tmpring_size = tmpring_size.u32All;
assert((tmpring_size.bits.WAVES % agent_props.NumShaderBanks == 0) &&
@@ -1415,11 +1424,15 @@ void AqlQueue::FillComputeTmpRingSize_Gfx11() {
// Scratch is allocated program COMPUTE_TMPRING_SIZE register
// Scratch Size per Wave is specified in terms of kilobytes
uint32_t wave_scratch =
(((queue_scratch_.lanes_per_wave * queue_scratch_.size_per_thread) + 255) / 256);
uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave * queue_scratch_.size_per_thread) +
queue_scratch_.mem_alignment_size - 1) /
queue_scratch_.mem_alignment_size);
tmpring_size.bits.WAVESIZE = wave_scratch;
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 256);
uint32_t num_waves =
queue_scratch_.size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size);
// For GFX11 we specify number of waves per engine instead of total
num_waves /= agent_->properties().NumShaderBanks;