Scratch memory changes to support multi-xcc

Change-Id: I115ba4cfe250c59cb7421217cfe0fad6302f25b3


[ROCm/ROCR-Runtime commit: 9554e95de0]
This commit is contained in:
Mike Li
2023-04-11 22:30:28 +00:00
Committed by David Yat Sin
Parent 3603303bc7
Commit dae51188d8
@@ -1329,7 +1329,11 @@ void AqlQueue::FillBufRsrcWord1_Gfx11() {
// Fills word 2 of the scratch buffer resource descriptor (the NUM_RECORDS
// field) and stores the packed word in
// amd_queue_.scratch_resource_descriptor[2].
void AqlQueue::FillBufRsrcWord2() {
  SQ_BUF_RSRC_WORD2 srd2;
  const auto& agent_props = agent_->properties();
  const uint32_t num_xcc = agent_props.NumXcc;

  // Report the size per XCC: the queue's total scratch size is divided by the
  // agent's XCC count. The value is written exactly once; an earlier store of
  // the undivided size was immediately overwritten and has been removed.
  // NOTE(review): assumes NumXcc is never 0 -- confirm against the code that
  // populates the agent properties.
  srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size / num_xcc);

  amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
}
@@ -1400,8 +1404,10 @@ void AqlQueue::FillComputeTmpRingSize() {
return;
}
// Determine the maximum number of waves device can support
const auto& agent_props = agent_->properties();
const uint32_t num_xcc = agent_props.NumXcc;
// Determine the maximum number of waves device can support
uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU;
uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU;
@@ -1413,10 +1419,11 @@ void AqlQueue::FillComputeTmpRingSize() {
tmpring_size.bits.WAVESIZE = wave_scratch;
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
uint32_t num_waves =
queue_scratch_.size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size);
(queue_scratch_.size / num_xcc) / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size);
tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
amd_queue_.compute_tmpring_size = tmpring_size.u32All;
assert((tmpring_size.bits.WAVES % agent_props.NumShaderBanks == 0) &&
assert((tmpring_size.bits.WAVES % (agent_props.NumShaderBanks / num_xcc) == 0) &&
"Invalid scratch wave count. Must be divisible by #SEs.");
}
@@ -1428,9 +1435,11 @@ void AqlQueue::FillComputeTmpRingSize_Gfx11() {
return;
}
// Determine the maximum number of waves device can support
const auto& agent_props = agent_->properties();
uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU;
const uint32_t num_xcc = agent_props.NumXcc;
// Determine the maximum number of waves device can support
uint32_t num_cus = agent_props.NumFComputeCores / (agent_props.NumSIMDPerCU * num_xcc);
uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU;
// Scratch is allocated program COMPUTE_TMPRING_SIZE register
@@ -1480,7 +1489,11 @@ void AqlQueue::InitScratchSRD() {
// Populate flat scratch parameters in amd_queue_.
amd_queue_.scratch_backing_memory_location = queue_scratch_.queue_process_offset;
amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
const auto& agent_props = agent_->properties();
const uint32_t num_xcc = agent_props.NumXcc;
// report size per XCC
amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size / num_xcc;
// For backwards compatibility this field records the per-lane scratch
// for a 64 lane wavefront. If scratch was allocated for 32 lane waves