From f8d0ccd159b088df5063bc6a84b89fda082132fd Mon Sep 17 00:00:00 2001
From: Jay Cornwall
Date: Tue, 18 Sep 2018 15:09:08 -0500
Subject: [PATCH] Support wave32/wave64 scratch allocations on gfx10

- Use new buffer resource descriptor layout
- Handle wave32 scratch allocation error from CP
- Make wavefront size a property of scratch allocation requests
- Repurpose wave64-specific amd_queue_t.scratch_workitem_byte_size field
- Clear index_stride field in V# on gfx10, calculated per-dispatch by CP

Change-Id: If2acdf6430772abd4d6a8c792fc8c11260764dda
---
Review notes (free text in this area is not part of the commit):

 * Formatting: this copy of the patch arrived with its line structure
   collapsed. Line breaks and blank context lines below were re-derived from
   the hunk headers' line counts and the diffstat; leading indentation and
   intra-line spacing are reconstructed and may not byte-match the target
   tree. Apply with whitespace tolerance and verify against the original
   commit before relying on it.

 * GpuAgent::QueueCreate: the value produced by
   AlignUp(private_segment_size, 1024 / scratch.lanes_per_wave) is only used
   for the second "> 262128" check. The retained context line
   "scratch.size_per_thread = private_segment_size;" then overwrites it with
   the unaligned request, so scratch.size is computed from the unaligned
   value. TODO confirm this is intended.

 * GpuAgent::QueueCreate: scratch.size now multiplies by the literal 32 where
   the removed line used properties_.MaxSlotsScratchCU, while
   AqlQueue::DynamicScratchHandler still uses MaxSlotsScratchCU. TODO confirm
   the two sizing formulas are meant to differ.

 * AqlQueue::DynamicScratchHandler: "error_code & 0x401" is true whenever
   bit 0 or bit 10 is set, even if other bits are also set; the removed test
   "error_code == 1" matched that exact value only. The retained comment
   still says not to process dynamic scratch if errors are present. TODO
   confirm the broader match is intended.

 * The literals 262128 (appears twice), 0x400 and 0x401 are unnamed; consider
   named constants in a follow-up.

 * amd_hsa_queue.h: scratch_workitem_byte_size is renamed to
   scratch_wave64_lane_byte_size with the same type and position in the hunk;
   any code that refers to the old field name will no longer compile.

 runtime/hsa-runtime/core/inc/amd_gpu_agent.h  |  1 +
 runtime/hsa-runtime/core/inc/registers.h      | 38 ++++++++++
 .../core/runtime/amd_aql_queue.cpp            | 74 +++++++++++++------
 .../core/runtime/amd_gpu_agent.cpp            | 15 +++-
 runtime/hsa-runtime/inc/amd_hsa_queue.h       |  2 +-
 5 files changed, 103 insertions(+), 27 deletions(-)

diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index c7791ad294..61aefcdec8 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -67,6 +67,7 @@ struct ScratchInfo {
   void* queue_base;
   size_t size;
   size_t size_per_thread;
+  uint32_t lanes_per_wave;
   ptrdiff_t queue_process_offset;
   bool large;
   bool retry;
diff --git a/runtime/hsa-runtime/core/inc/registers.h b/runtime/hsa-runtime/core/inc/registers.h
index 39d86aecf3..211ff5f9d3 100644
--- a/runtime/hsa-runtime/core/inc/registers.h
+++ b/runtime/hsa-runtime/core/inc/registers.h
@@ -85,6 +85,10 @@ BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006,
 BUF_NUM_FORMAT_FLOAT = 0x00000007,
 } BUF_NUM_FORMAT;
 
+typedef enum BUF_FORMAT {
+BUF_FORMAT_32_UINT = 0x00000014,
+} BUF_FORMAT;
+
 typedef enum SQ_SEL_XYZW01 {
 SQ_SEL_0 = 0x00000000,
 SQ_SEL_1 = 0x00000001,
@@ -201,4 +205,38 @@ SQ_SEL_W = 0x00000007,
     float f32All;
   };
 
+  union SQ_BUF_RSRC_WORD3_GFX10 {
+    struct {
+#if defined(LITTLEENDIAN_CPU)
+      unsigned int DST_SEL_X : 3;
+      unsigned int DST_SEL_Y : 3;
+      unsigned int DST_SEL_Z : 3;
+      unsigned int DST_SEL_W : 3;
+      unsigned int FORMAT : 7;
+      unsigned int RESERVED1 : 2;
+      unsigned int INDEX_STRIDE : 2;
+      unsigned int ADD_TID_ENABLE : 1;
+      unsigned int RESOURCE_LEVEL : 1;
+      unsigned int RESERVED2 : 3;
+      unsigned int OOB_SELECT : 2;
+      unsigned int TYPE : 2;
+#elif defined(BIGENDIAN_CPU)
+      unsigned int TYPE : 2;
+      unsigned int OOB_SELECT : 2;
+      unsigned int RESERVED2 : 3;
+      unsigned int RESOURCE_LEVEL : 1;
+      unsigned int ADD_TID_ENABLE : 1;
+      unsigned int INDEX_STRIDE : 2;
+      unsigned int RESERVED1 : 2;
+      unsigned int FORMAT : 7;
+      unsigned int DST_SEL_W : 3;
+      unsigned int DST_SEL_Z : 3;
+      unsigned int DST_SEL_Y : 3;
+      unsigned int DST_SEL_X : 3;
+#endif
+    } bitfields, bits;
+    unsigned int u32All;
+    signed int i32All;
+    float f32All;
+  };
 #endif // header guard
diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index c3f284b086..f2bdf85d3d 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -749,7 +749,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
     }
 
     // Process only one queue error.
-    if (error_code == 1) {
+    if (error_code & 0x401) {  // insufficient scratch, wave64 or wave32
       // Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
       auto& scratch = queue->queue_scratch_;
 
@@ -764,10 +764,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       uint32_t scratch_request = pkt.dispatch.private_segment_size;
 
       scratch.size_per_thread = scratch_request;
+      scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
       // Align whole waves to 1KB.
-      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
+      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
       scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
-          queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;
+          queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;
 
       queue->agent_->AcquireQueueScratch(scratch);
 
@@ -1001,7 +1002,7 @@ void AqlQueue::InitScratchSRD() {
   SQ_BUF_RSRC_WORD0 srd0;
   SQ_BUF_RSRC_WORD1 srd1;
   SQ_BUF_RSRC_WORD2 srd2;
-  SQ_BUF_RSRC_WORD3 srd3;
+  uint32_t srd3_u32;
 
   uint32_t scratch_base_hi = 0;
   uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
@@ -1017,33 +1018,60 @@ void AqlQueue::InitScratchSRD() {
 
   srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
 
-  srd3.bits.DST_SEL_X = SQ_SEL_X;
-  srd3.bits.DST_SEL_Y = SQ_SEL_Y;
-  srd3.bits.DST_SEL_Z = SQ_SEL_Z;
-  srd3.bits.DST_SEL_W = SQ_SEL_W;
-  srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
-  srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
-  srd3.bits.ELEMENT_SIZE = 1;  // 4
-  srd3.bits.INDEX_STRIDE = 3;  // 64
-  srd3.bits.ADD_TID_ENABLE = 1;
-  srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
-  srd3.bits.HASH_ENABLE = 0;
-  srd3.bits.HEAP = 0;
-  srd3.bits.MTYPE__CI__VI = 0;
-  srd3.bits.TYPE = SQ_RSRC_BUF;
+  if (agent_->isa()->GetMajorVersion() < 10) {
+    SQ_BUF_RSRC_WORD3 srd3;
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
+    srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
+    srd3.bits.ELEMENT_SIZE = 1;  // 4
+    srd3.bits.INDEX_STRIDE = 3;  // 64
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
+    srd3.bits.HASH_ENABLE = 0;
+    srd3.bits.HEAP = 0;
+    srd3.bits.MTYPE__CI__VI = 0;
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else {
+    SQ_BUF_RSRC_WORD3_GFX10 srd3;
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.RESOURCE_LEVEL = 1;
+    srd3.bits.RESERVED2 = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  }
 
   // Update Queue's Scratch descriptor's property
   amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
   amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
   amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
-  amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
+  amd_queue_.scratch_resource_descriptor[3] = srd3_u32;
 
   // Populate flat scratch parameters in amd_queue_.
   amd_queue_.scratch_backing_memory_location =
       queue_scratch_.queue_process_offset;
   amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
-  amd_queue_.scratch_workitem_byte_size =
-      uint32_t(queue_scratch_.size_per_thread);
+
+  // For backwards compatibility this field records the per-lane scratch
+  // for a 64 lane wavefront. If scratch was allocated for 32 lane waves
+  // then the effective size for a 64 lane wave is halved.
+  amd_queue_.scratch_wave64_lane_byte_size =
+      uint32_t((queue_scratch_.size_per_thread * queue_scratch_.lanes_per_wave) / 64);
 
   // Set concurrent wavefront limits only when scratch is being used.
   COMPUTE_TMPRING_SIZE tmpring_size = {};
@@ -1059,8 +1087,8 @@ void AqlQueue::InitScratchSRD() {
 
   // Scratch is allocated program COMPUTE_TMPRING_SIZE register
   // Scratch Size per Wave is specified in terms of kilobytes
-  uint32_t wave_size = agent_props.WaveFrontSize;
-  uint32_t wave_scratch = (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024);
+  uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave *
+                             queue_scratch_.size_per_thread) + 1023) / 1024);
   tmpring_size.bits.WAVESIZE = wave_scratch;
   assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
   uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 5b27486ddc..78c10ec230 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -886,13 +886,22 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
   // Allocate scratch memory
   ScratchInfo scratch;
   if (private_segment_size == UINT_MAX) {
-    private_segment_size = 0;
+    private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
+  }
+
+  if (private_segment_size > 262128) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  scratch.lanes_per_wave = 64;
+  scratch.size_per_thread = AlignUp(private_segment_size, 1024 / scratch.lanes_per_wave);
+  if (scratch.size_per_thread > 262128) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
   scratch.size_per_thread = private_segment_size;
 
   const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
-  scratch.size =
-      scratch.size_per_thread * properties_.MaxSlotsScratchCU * properties_.WaveFrontSize * num_cu;
+  scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
   scratch.queue_base = nullptr;
   scratch.queue_process_offset = 0;
 
diff --git a/runtime/hsa-runtime/inc/amd_hsa_queue.h b/runtime/hsa-runtime/inc/amd_hsa_queue.h
index 2176e84706..2da98964da 100644
--- a/runtime/hsa-runtime/inc/amd_hsa_queue.h
+++ b/runtime/hsa-runtime/inc/amd_hsa_queue.h
@@ -77,7 +77,7 @@ typedef struct AMD_QUEUE_ALIGN amd_queue_s {
   uint32_t scratch_resource_descriptor[4];
   uint64_t scratch_backing_memory_location;
   uint64_t scratch_backing_memory_byte_size;
-  uint32_t scratch_workitem_byte_size;
+  uint32_t scratch_wave64_lane_byte_size;
   amd_queue_properties32_t queue_properties;
   uint32_t reserved3[2];
   hsa_signal_t queue_inactive_signal;