Support wave32/wave64 scratch allocations on gfx10

- Use new buffer resource descriptor layout
- Handle wave32 scratch allocation error from CP
- Make wavefront size a property of scratch allocation requests
- Repurpose wave64-specific amd_queue_t.scratch_workitem_byte_size field
- Clear the index_stride field in the V# on gfx10; it is calculated per-dispatch by CP

Change-Id: If2acdf6430772abd4d6a8c792fc8c11260764dda
Tento commit je obsažen v:
Jay Cornwall
2018-09-18 15:09:08 -05:00
odevzdal Chris Freehill
rodič 08841faf4c
revize f8d0ccd159
5 změnil soubory, kde provedl 103 přidání a 27 odebrání
+1
Zobrazit soubor
@@ -67,6 +67,7 @@ struct ScratchInfo {
void* queue_base;
size_t size;
size_t size_per_thread;
uint32_t lanes_per_wave;
ptrdiff_t queue_process_offset;
bool large;
bool retry;
+38
Zobrazit soubor
@@ -85,6 +85,10 @@ BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006,
BUF_NUM_FORMAT_FLOAT = 0x00000007,
} BUF_NUM_FORMAT;
// Buffer format encodings for the FORMAT bitfield of SQ_BUF_RSRC_WORD3_GFX10.
// Only the single encoding needed so far is declared here.
typedef enum BUF_FORMAT {
// Presumably the 32-bit unsigned-integer format (going by the name); the
// value 0x14 fits in the 7-bit FORMAT field.
BUF_FORMAT_32_UINT = 0x00000014,
} BUF_FORMAT;
typedef enum SQ_SEL_XYZW01 {
SQ_SEL_0 = 0x00000000,
SQ_SEL_1 = 0x00000001,
@@ -201,4 +205,38 @@ SQ_SEL_W = 0x00000007,
float f32All;
};
// Fourth dword (word 3) of a buffer resource descriptor in the GFX10 layout.
//
// The same 32 bits can be accessed either through the named bitfields
// (`bitfields` / `bits`) or as a raw value (`u32All`, `i32All`, `f32All`).
// The bitfield widths add up to exactly 32 bits:
//   3 + 3 + 3 + 3 + 7 + 2 + 2 + 1 + 1 + 3 + 2 + 2.
//
// The two preprocessor branches declare the same fields in opposite order.
// If neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined, the inner struct
// has no members and only the raw views are usable.
union SQ_BUF_RSRC_WORD3_GFX10 {
struct {
#if defined(LITTLEENDIAN_CPU)
// Destination channel selectors, 3 bits each (presumably SQ_SEL_XYZW01 values).
unsigned int DST_SEL_X : 3;
unsigned int DST_SEL_Y : 3;
unsigned int DST_SEL_Z : 3;
unsigned int DST_SEL_W : 3;
// Buffer format encoding, 7 bits (see BUF_FORMAT).
unsigned int FORMAT : 7;
unsigned int RESERVED1 : 2;
unsigned int INDEX_STRIDE : 2;
unsigned int ADD_TID_ENABLE : 1;
unsigned int RESOURCE_LEVEL : 1;
unsigned int RESERVED2 : 3;
unsigned int OOB_SELECT : 2;
// Resource type, 2 bits.
unsigned int TYPE : 2;
#elif defined(BIGENDIAN_CPU)
// Same fields as the little-endian branch, declared in reverse order.
unsigned int TYPE : 2;
unsigned int OOB_SELECT : 2;
unsigned int RESERVED2 : 3;
unsigned int RESOURCE_LEVEL : 1;
unsigned int ADD_TID_ENABLE : 1;
unsigned int INDEX_STRIDE : 2;
unsigned int RESERVED1 : 2;
unsigned int FORMAT : 7;
unsigned int DST_SEL_W : 3;
unsigned int DST_SEL_Z : 3;
unsigned int DST_SEL_Y : 3;
unsigned int DST_SEL_X : 3;
#endif
} bitfields, bits;  // two names for the same bitfield view
// Raw views of the whole word.
unsigned int u32All;
signed int i32All;
float f32All;
};
#endif // header guard
+51 -23
Zobrazit soubor
@@ -749,7 +749,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
}
// Process only one queue error.
if (error_code == 1) {
if (error_code & 0x401) { // insufficient scratch, wave64 or wave32
// Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
auto& scratch = queue->queue_scratch_;
@@ -764,10 +764,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
uint32_t scratch_request = pkt.dispatch.private_segment_size;
scratch.size_per_thread = scratch_request;
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
// Align whole waves to 1KB.
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;
queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;
queue->agent_->AcquireQueueScratch(scratch);
@@ -1001,7 +1002,7 @@ void AqlQueue::InitScratchSRD() {
SQ_BUF_RSRC_WORD0 srd0;
SQ_BUF_RSRC_WORD1 srd1;
SQ_BUF_RSRC_WORD2 srd2;
SQ_BUF_RSRC_WORD3 srd3;
uint32_t srd3_u32;
uint32_t scratch_base_hi = 0;
uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
@@ -1017,33 +1018,60 @@ void AqlQueue::InitScratchSRD() {
srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
srd3.bits.ELEMENT_SIZE = 1; // 4
srd3.bits.INDEX_STRIDE = 3; // 64
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
srd3.bits.HASH_ENABLE = 0;
srd3.bits.HEAP = 0;
srd3.bits.MTYPE__CI__VI = 0;
srd3.bits.TYPE = SQ_RSRC_BUF;
if (agent_->isa()->GetMajorVersion() < 10) {
SQ_BUF_RSRC_WORD3 srd3;
srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
srd3.bits.ELEMENT_SIZE = 1; // 4
srd3.bits.INDEX_STRIDE = 3; // 64
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
srd3.bits.HASH_ENABLE = 0;
srd3.bits.HEAP = 0;
srd3.bits.MTYPE__CI__VI = 0;
srd3.bits.TYPE = SQ_RSRC_BUF;
srd3_u32 = srd3.u32All;
} else {
SQ_BUF_RSRC_WORD3_GFX10 srd3;
srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
srd3.bits.RESERVED1 = 0;
srd3.bits.INDEX_STRIDE = 0; // filled in by CP
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.RESOURCE_LEVEL = 1;
srd3.bits.RESERVED2 = 0;
srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode
srd3.bits.TYPE = SQ_RSRC_BUF;
srd3_u32 = srd3.u32All;
}
// Update Queue's Scratch descriptor's property
amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
amd_queue_.scratch_resource_descriptor[3] = srd3_u32;
// Populate flat scratch parameters in amd_queue_.
amd_queue_.scratch_backing_memory_location =
queue_scratch_.queue_process_offset;
amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
amd_queue_.scratch_workitem_byte_size =
uint32_t(queue_scratch_.size_per_thread);
// For backwards compatibility this field records the per-lane scratch
// for a 64 lane wavefront. If scratch was allocated for 32 lane waves
// then the effective size for a 64 lane wave is halved.
amd_queue_.scratch_wave64_lane_byte_size =
uint32_t((queue_scratch_.size_per_thread * queue_scratch_.lanes_per_wave) / 64);
// Set concurrent wavefront limits only when scratch is being used.
COMPUTE_TMPRING_SIZE tmpring_size = {};
@@ -1059,8 +1087,8 @@ void AqlQueue::InitScratchSRD() {
// Scratch is allocated program COMPUTE_TMPRING_SIZE register
// Scratch Size per Wave is specified in terms of kilobytes
uint32_t wave_size = agent_props.WaveFrontSize;
uint32_t wave_scratch = (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024);
uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave *
queue_scratch_.size_per_thread) + 1023) / 1024);
tmpring_size.bits.WAVESIZE = wave_scratch;
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
+12 -3
Zobrazit soubor
@@ -886,13 +886,22 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
// Allocate scratch memory
ScratchInfo scratch;
if (private_segment_size == UINT_MAX) {
private_segment_size = 0;
private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
}
if (private_segment_size > 262128) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
scratch.lanes_per_wave = 64;
scratch.size_per_thread = AlignUp(private_segment_size, 1024 / scratch.lanes_per_wave);
if (scratch.size_per_thread > 262128) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
scratch.size_per_thread = private_segment_size;
const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
scratch.size =
scratch.size_per_thread * properties_.MaxSlotsScratchCU * properties_.WaveFrontSize * num_cu;
scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
scratch.queue_base = nullptr;
scratch.queue_process_offset = 0;
+1 -1
Zobrazit soubor
@@ -77,7 +77,7 @@ typedef struct AMD_QUEUE_ALIGN amd_queue_s {
uint32_t scratch_resource_descriptor[4];
uint64_t scratch_backing_memory_location;
uint64_t scratch_backing_memory_byte_size;
uint32_t scratch_workitem_byte_size;
uint32_t scratch_wave64_lane_byte_size;
amd_queue_properties32_t queue_properties;
uint32_t reserved3[2];
hsa_signal_t queue_inactive_signal;