From f8d0ccd159b088df5063bc6a84b89fda082132fd Mon Sep 17 00:00:00 2001
From: Jay Cornwall <Jay.Cornwall@amd.com>
Date: Tue, 18 Sep 2018 15:09:08 -0500
Subject: [PATCH] Support wave32/wave64 scratch allocations on gfx10

- Use new buffer resource descriptor layout
- Handle wave32 scratch allocation error from CP
- Make wavefront size a property of scratch allocation requests
- Repurpose amd_queue_t.scratch_workitem_byte_size as the wave64-specific
  scratch_wave64_lane_byte_size field (per-lane scratch for a 64-lane wave)
- Clear index_stride field in V# on gfx10; it is calculated per-dispatch by CP

Change-Id: If2acdf6430772abd4d6a8c792fc8c11260764dda
---
 runtime/hsa-runtime/core/inc/amd_gpu_agent.h  |  1 +
 runtime/hsa-runtime/core/inc/registers.h      | 38 ++++++++++
 .../core/runtime/amd_aql_queue.cpp            | 74 +++++++++++++------
 .../core/runtime/amd_gpu_agent.cpp            | 15 +++-
 runtime/hsa-runtime/inc/amd_hsa_queue.h       |  2 +-
 5 files changed, 103 insertions(+), 27 deletions(-)

diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index c7791ad294..61aefcdec8 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -67,6 +67,7 @@ struct ScratchInfo {
   void* queue_base;
   size_t size;
   size_t size_per_thread;
+  uint32_t lanes_per_wave;
   ptrdiff_t queue_process_offset;
   bool large;
   bool retry;
diff --git a/runtime/hsa-runtime/core/inc/registers.h b/runtime/hsa-runtime/core/inc/registers.h
index 39d86aecf3..211ff5f9d3 100644
--- a/runtime/hsa-runtime/core/inc/registers.h
+++ b/runtime/hsa-runtime/core/inc/registers.h
@@ -85,6 +85,10 @@ BUF_NUM_FORMAT_RESERVED_6__VI            = 0x00000006,
 BUF_NUM_FORMAT_FLOAT                     = 0x00000007,
 } BUF_NUM_FORMAT;
 
+typedef enum BUF_FORMAT {  // values written to SQ_BUF_RSRC_WORD3_GFX10::FORMAT
+BUF_FORMAT_32_UINT                       = 0x00000014,  // presumably a 32-bit unsigned integer format -- confirm against the gfx10 ISA docs
+} BUF_FORMAT;
+
 typedef enum SQ_SEL_XYZW01 {
 SQ_SEL_0                                 = 0x00000000,
 SQ_SEL_1                                 = 0x00000001,
@@ -201,4 +205,38 @@ SQ_SEL_W                                 = 0x00000007,
 	float	f32All;
 	};
 
+	union SQ_BUF_RSRC_WORD3_GFX10 {  // word 3 of the buffer resource descriptor, gfx10 layout
+	struct {  // bit-field widths below total 32
+#if		defined(LITTLEENDIAN_CPU)
+                unsigned int                       DST_SEL_X : 3;  // DST_SEL_*: set from SQ_SEL_* values
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                          FORMAT : 7;  // set from a BUF_FORMAT value
+                unsigned int                       RESERVED1 : 2;  // written as 0
+                unsigned int                    INDEX_STRIDE : 2;  // written as 0 for queue scratch; filled in by CP
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                  RESOURCE_LEVEL : 1;
+                unsigned int                       RESERVED2 : 3;  // written as 0
+                unsigned int                      OOB_SELECT : 2;  // queue scratch uses 2: no bounds check in swizzle mode
+                unsigned int                            TYPE : 2;  // SQ_RSRC_BUF for the queue scratch descriptor
+#elif		defined(BIGENDIAN_CPU)
+                unsigned int                            TYPE : 2;  // same fields as the branch above, declared in reverse order
+                unsigned int                      OOB_SELECT : 2;
+                unsigned int                       RESERVED2 : 3;
+                unsigned int                  RESOURCE_LEVEL : 1;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                       RESERVED1 : 2;
+                unsigned int                          FORMAT : 7;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_X : 3;
+#endif
+	} bitfields, bits;  // two union members of the same struct type; both name the same field view
+	unsigned int	u32All;  // the whole word as an unsigned integer
+	signed int	i32All;  // the whole word as a signed integer
+	float	f32All;  // the whole word reinterpreted as a float
+	};
 #endif  // header guard
diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index c3f284b086..f2bdf85d3d 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -749,7 +749,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
     }
 
     // Process only one queue error.
-    if (error_code == 1) {
+    if (error_code & 0x401) {  // insufficient scratch, wave64 or wave32
       // Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
       auto& scratch = queue->queue_scratch_;
 
@@ -764,10 +764,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       uint32_t scratch_request = pkt.dispatch.private_segment_size;
 
       scratch.size_per_thread = scratch_request;
+      scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
       // Align whole waves to 1KB.
-      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
+      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
       scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
-          queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;
+          queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;
 
       queue->agent_->AcquireQueueScratch(scratch);
 
@@ -1001,7 +1002,7 @@ void AqlQueue::InitScratchSRD() {
   SQ_BUF_RSRC_WORD0 srd0;
   SQ_BUF_RSRC_WORD1 srd1;
   SQ_BUF_RSRC_WORD2 srd2;
-  SQ_BUF_RSRC_WORD3 srd3;
+  uint32_t srd3_u32;
 
   uint32_t scratch_base_hi = 0;
   uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
@@ -1017,33 +1018,60 @@ void AqlQueue::InitScratchSRD() {
 
   srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
 
-  srd3.bits.DST_SEL_X = SQ_SEL_X;
-  srd3.bits.DST_SEL_Y = SQ_SEL_Y;
-  srd3.bits.DST_SEL_Z = SQ_SEL_Z;
-  srd3.bits.DST_SEL_W = SQ_SEL_W;
-  srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
-  srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
-  srd3.bits.ELEMENT_SIZE = 1;  // 4
-  srd3.bits.INDEX_STRIDE = 3;  // 64
-  srd3.bits.ADD_TID_ENABLE = 1;
-  srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
-  srd3.bits.HASH_ENABLE = 0;
-  srd3.bits.HEAP = 0;
-  srd3.bits.MTYPE__CI__VI = 0;
-  srd3.bits.TYPE = SQ_RSRC_BUF;
+  if (agent_->isa()->GetMajorVersion() < 10) {
+    SQ_BUF_RSRC_WORD3 srd3;
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
+    srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
+    srd3.bits.ELEMENT_SIZE = 1;  // 4
+    srd3.bits.INDEX_STRIDE = 3;  // 64
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
+    srd3.bits.HASH_ENABLE = 0;
+    srd3.bits.HEAP = 0;
+    srd3.bits.MTYPE__CI__VI = 0;
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else {
+    SQ_BUF_RSRC_WORD3_GFX10 srd3;
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.RESOURCE_LEVEL = 1;
+    srd3.bits.RESERVED2 = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  }
 
   // Update Queue's Scratch descriptor's property
   amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
   amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
   amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
-  amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
+  amd_queue_.scratch_resource_descriptor[3] = srd3_u32;
 
   // Populate flat scratch parameters in amd_queue_.
   amd_queue_.scratch_backing_memory_location =
       queue_scratch_.queue_process_offset;
   amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
-  amd_queue_.scratch_workitem_byte_size =
-      uint32_t(queue_scratch_.size_per_thread);
+
+  // For backwards compatibility this field records the per-lane scratch
+  // for a 64 lane wavefront. If scratch was allocated for 32 lane waves
+  // then the effective size for a 64 lane wave is halved.
+  amd_queue_.scratch_wave64_lane_byte_size =
+      uint32_t((queue_scratch_.size_per_thread * queue_scratch_.lanes_per_wave) / 64);
 
   // Set concurrent wavefront limits only when scratch is being used.
   COMPUTE_TMPRING_SIZE tmpring_size = {};
@@ -1059,8 +1087,8 @@ void AqlQueue::InitScratchSRD() {
 
   // Scratch is allocated program COMPUTE_TMPRING_SIZE register
   // Scratch Size per Wave is specified in terms of kilobytes
-  uint32_t wave_size = agent_props.WaveFrontSize;
-  uint32_t wave_scratch = (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024);
+  uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave *
+                               queue_scratch_.size_per_thread) + 1023) / 1024);
   tmpring_size.bits.WAVESIZE = wave_scratch;
   assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
   uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 5b27486ddc..78c10ec230 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -886,13 +886,22 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
   // Allocate scratch memory
   ScratchInfo scratch;
   if (private_segment_size == UINT_MAX) {
-    private_segment_size = 0;
+    private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
+  }
+
+  if (private_segment_size > 262128) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  scratch.lanes_per_wave = 64;
+  scratch.size_per_thread = AlignUp(private_segment_size, 1024 / scratch.lanes_per_wave);
+  if (scratch.size_per_thread > 262128) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
   scratch.size_per_thread = private_segment_size;
 
   const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
-  scratch.size =
-      scratch.size_per_thread * properties_.MaxSlotsScratchCU * properties_.WaveFrontSize * num_cu;
+  scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
   scratch.queue_base = nullptr;
   scratch.queue_process_offset = 0;
 
diff --git a/runtime/hsa-runtime/inc/amd_hsa_queue.h b/runtime/hsa-runtime/inc/amd_hsa_queue.h
index 2176e84706..2da98964da 100644
--- a/runtime/hsa-runtime/inc/amd_hsa_queue.h
+++ b/runtime/hsa-runtime/inc/amd_hsa_queue.h
@@ -77,7 +77,7 @@ typedef struct AMD_QUEUE_ALIGN amd_queue_s {
   uint32_t scratch_resource_descriptor[4];
   uint64_t scratch_backing_memory_location;
   uint64_t scratch_backing_memory_byte_size;
-  uint32_t scratch_workitem_byte_size;
+  uint32_t scratch_wave64_lane_byte_size;
   amd_queue_properties32_t queue_properties;
   uint32_t reserved3[2];
   hsa_signal_t queue_inactive_signal;