Support wave32/wave64 scratch allocations on gfx10
- Use new buffer resource descriptor layout - Handle wave32 scratch allocation error from CP - Make wavefront size a property of scratch allocation requests - Repurpose wave64-specific amd_queue_t.scratch_workitem_byte_size field - Clear index_stride field in V# on gfx10, calculated per-dispatch by CP Change-Id: If2acdf6430772abd4d6a8c792fc8c11260764dda
Tento commit je obsažen v:
@@ -67,6 +67,7 @@ struct ScratchInfo {
|
||||
void* queue_base;
|
||||
size_t size;
|
||||
size_t size_per_thread;
|
||||
uint32_t lanes_per_wave;
|
||||
ptrdiff_t queue_process_offset;
|
||||
bool large;
|
||||
bool retry;
|
||||
|
||||
@@ -85,6 +85,10 @@ BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006,
|
||||
BUF_NUM_FORMAT_FLOAT = 0x00000007,
|
||||
} BUF_NUM_FORMAT;
|
||||
|
||||
typedef enum BUF_FORMAT {
|
||||
BUF_FORMAT_32_UINT = 0x00000014,
|
||||
} BUF_FORMAT;
|
||||
|
||||
typedef enum SQ_SEL_XYZW01 {
|
||||
SQ_SEL_0 = 0x00000000,
|
||||
SQ_SEL_1 = 0x00000001,
|
||||
@@ -201,4 +205,38 @@ SQ_SEL_W = 0x00000007,
|
||||
float f32All;
|
||||
};
|
||||
|
||||
union SQ_BUF_RSRC_WORD3_GFX10 {
|
||||
struct {
|
||||
#if defined(LITTLEENDIAN_CPU)
|
||||
unsigned int DST_SEL_X : 3;
|
||||
unsigned int DST_SEL_Y : 3;
|
||||
unsigned int DST_SEL_Z : 3;
|
||||
unsigned int DST_SEL_W : 3;
|
||||
unsigned int FORMAT : 7;
|
||||
unsigned int RESERVED1 : 2;
|
||||
unsigned int INDEX_STRIDE : 2;
|
||||
unsigned int ADD_TID_ENABLE : 1;
|
||||
unsigned int RESOURCE_LEVEL : 1;
|
||||
unsigned int RESERVED2 : 3;
|
||||
unsigned int OOB_SELECT : 2;
|
||||
unsigned int TYPE : 2;
|
||||
#elif defined(BIGENDIAN_CPU)
|
||||
unsigned int TYPE : 2;
|
||||
unsigned int OOB_SELECT : 2;
|
||||
unsigned int RESERVED2 : 3;
|
||||
unsigned int RESOURCE_LEVEL : 1;
|
||||
unsigned int ADD_TID_ENABLE : 1;
|
||||
unsigned int INDEX_STRIDE : 2;
|
||||
unsigned int RESERVED1 : 2;
|
||||
unsigned int FORMAT : 7;
|
||||
unsigned int DST_SEL_W : 3;
|
||||
unsigned int DST_SEL_Z : 3;
|
||||
unsigned int DST_SEL_Y : 3;
|
||||
unsigned int DST_SEL_X : 3;
|
||||
#endif
|
||||
} bitfields, bits;
|
||||
unsigned int u32All;
|
||||
signed int i32All;
|
||||
float f32All;
|
||||
};
|
||||
#endif // header guard
|
||||
|
||||
@@ -749,7 +749,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
|
||||
}
|
||||
|
||||
// Process only one queue error.
|
||||
if (error_code == 1) {
|
||||
if (error_code & 0x401) { // insufficient scratch, wave64 or wave32
|
||||
// Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
|
||||
auto& scratch = queue->queue_scratch_;
|
||||
|
||||
@@ -764,10 +764,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
|
||||
uint32_t scratch_request = pkt.dispatch.private_segment_size;
|
||||
|
||||
scratch.size_per_thread = scratch_request;
|
||||
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
|
||||
// Align whole waves to 1KB.
|
||||
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
|
||||
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
|
||||
scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
|
||||
queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;
|
||||
queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;
|
||||
|
||||
queue->agent_->AcquireQueueScratch(scratch);
|
||||
|
||||
@@ -1001,7 +1002,7 @@ void AqlQueue::InitScratchSRD() {
|
||||
SQ_BUF_RSRC_WORD0 srd0;
|
||||
SQ_BUF_RSRC_WORD1 srd1;
|
||||
SQ_BUF_RSRC_WORD2 srd2;
|
||||
SQ_BUF_RSRC_WORD3 srd3;
|
||||
uint32_t srd3_u32;
|
||||
|
||||
uint32_t scratch_base_hi = 0;
|
||||
uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
|
||||
@@ -1017,33 +1018,60 @@ void AqlQueue::InitScratchSRD() {
|
||||
|
||||
srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
|
||||
|
||||
srd3.bits.DST_SEL_X = SQ_SEL_X;
|
||||
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
|
||||
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
|
||||
srd3.bits.DST_SEL_W = SQ_SEL_W;
|
||||
srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
|
||||
srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
|
||||
srd3.bits.ELEMENT_SIZE = 1; // 4
|
||||
srd3.bits.INDEX_STRIDE = 3; // 64
|
||||
srd3.bits.ADD_TID_ENABLE = 1;
|
||||
srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
|
||||
srd3.bits.HASH_ENABLE = 0;
|
||||
srd3.bits.HEAP = 0;
|
||||
srd3.bits.MTYPE__CI__VI = 0;
|
||||
srd3.bits.TYPE = SQ_RSRC_BUF;
|
||||
if (agent_->isa()->GetMajorVersion() < 10) {
|
||||
SQ_BUF_RSRC_WORD3 srd3;
|
||||
|
||||
srd3.bits.DST_SEL_X = SQ_SEL_X;
|
||||
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
|
||||
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
|
||||
srd3.bits.DST_SEL_W = SQ_SEL_W;
|
||||
srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
|
||||
srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
|
||||
srd3.bits.ELEMENT_SIZE = 1; // 4
|
||||
srd3.bits.INDEX_STRIDE = 3; // 64
|
||||
srd3.bits.ADD_TID_ENABLE = 1;
|
||||
srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
|
||||
srd3.bits.HASH_ENABLE = 0;
|
||||
srd3.bits.HEAP = 0;
|
||||
srd3.bits.MTYPE__CI__VI = 0;
|
||||
srd3.bits.TYPE = SQ_RSRC_BUF;
|
||||
|
||||
srd3_u32 = srd3.u32All;
|
||||
} else {
|
||||
SQ_BUF_RSRC_WORD3_GFX10 srd3;
|
||||
|
||||
srd3.bits.DST_SEL_X = SQ_SEL_X;
|
||||
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
|
||||
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
|
||||
srd3.bits.DST_SEL_W = SQ_SEL_W;
|
||||
srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
|
||||
srd3.bits.RESERVED1 = 0;
|
||||
srd3.bits.INDEX_STRIDE = 0; // filled in by CP
|
||||
srd3.bits.ADD_TID_ENABLE = 1;
|
||||
srd3.bits.RESOURCE_LEVEL = 1;
|
||||
srd3.bits.RESERVED2 = 0;
|
||||
srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode
|
||||
srd3.bits.TYPE = SQ_RSRC_BUF;
|
||||
|
||||
srd3_u32 = srd3.u32All;
|
||||
}
|
||||
|
||||
// Update Queue's Scratch descriptor's property
|
||||
amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
|
||||
amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
|
||||
amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
|
||||
amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
|
||||
amd_queue_.scratch_resource_descriptor[3] = srd3_u32;
|
||||
|
||||
// Populate flat scratch parameters in amd_queue_.
|
||||
amd_queue_.scratch_backing_memory_location =
|
||||
queue_scratch_.queue_process_offset;
|
||||
amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
|
||||
amd_queue_.scratch_workitem_byte_size =
|
||||
uint32_t(queue_scratch_.size_per_thread);
|
||||
|
||||
// For backwards compatibility this field records the per-lane scratch
|
||||
// for a 64 lane wavefront. If scratch was allocated for 32 lane waves
|
||||
// then the effective size for a 64 lane wave is halved.
|
||||
amd_queue_.scratch_wave64_lane_byte_size =
|
||||
uint32_t((queue_scratch_.size_per_thread * queue_scratch_.lanes_per_wave) / 64);
|
||||
|
||||
// Set concurrent wavefront limits only when scratch is being used.
|
||||
COMPUTE_TMPRING_SIZE tmpring_size = {};
|
||||
@@ -1059,8 +1087,8 @@ void AqlQueue::InitScratchSRD() {
|
||||
|
||||
// Scratch is allocated program COMPUTE_TMPRING_SIZE register
|
||||
// Scratch Size per Wave is specified in terms of kilobytes
|
||||
uint32_t wave_size = agent_props.WaveFrontSize;
|
||||
uint32_t wave_scratch = (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024);
|
||||
uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave *
|
||||
queue_scratch_.size_per_thread) + 1023) / 1024);
|
||||
tmpring_size.bits.WAVESIZE = wave_scratch;
|
||||
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
|
||||
uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
|
||||
|
||||
@@ -886,13 +886,22 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
|
||||
// Allocate scratch memory
|
||||
ScratchInfo scratch;
|
||||
if (private_segment_size == UINT_MAX) {
|
||||
private_segment_size = 0;
|
||||
private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
|
||||
}
|
||||
|
||||
if (private_segment_size > 262128) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
scratch.lanes_per_wave = 64;
|
||||
scratch.size_per_thread = AlignUp(private_segment_size, 1024 / scratch.lanes_per_wave);
|
||||
if (scratch.size_per_thread > 262128) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
scratch.size_per_thread = private_segment_size;
|
||||
|
||||
const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
|
||||
scratch.size =
|
||||
scratch.size_per_thread * properties_.MaxSlotsScratchCU * properties_.WaveFrontSize * num_cu;
|
||||
scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
|
||||
scratch.queue_base = nullptr;
|
||||
scratch.queue_process_offset = 0;
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ typedef struct AMD_QUEUE_ALIGN amd_queue_s {
|
||||
uint32_t scratch_resource_descriptor[4];
|
||||
uint64_t scratch_backing_memory_location;
|
||||
uint64_t scratch_backing_memory_byte_size;
|
||||
uint32_t scratch_workitem_byte_size;
|
||||
uint32_t scratch_wave64_lane_byte_size;
|
||||
amd_queue_properties32_t queue_properties;
|
||||
uint32_t reserved3[2];
|
||||
hsa_signal_t queue_inactive_signal;
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele