diff --git a/projects/rocr-runtime/src/queues.c b/projects/rocr-runtime/src/queues.c index 33b81fc09c..29949c9092 100644 --- a/projects/rocr-runtime/src/queues.c +++ b/projects/rocr-runtime/src/queues.c @@ -47,10 +47,9 @@ #define LDS_SIZE_PER_CU 0x10000 #define HWREG_SIZE_PER_CU 0x1000 #define WG_CONTEXT_DATA_SIZE_PER_CU(asic_family) (VGPR_SIZE_PER_CU(asic_family) + SGPR_SIZE_PER_CU + LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU) -#define WAVES_PER_CU 32 -#define CNTL_STACK_BYTES_PER_CU(asic_family) (WAVES_PER_CU * (asic_family >= CHIP_NAVI10 ? 12 : 8)) +#define CNTL_STACK_BYTES_PER_WAVE(asic_family) (asic_family >= CHIP_NAVI10 ? 12 : 8) #define DEBUGGER_BYTES_ALIGN 64 -#define DEBUGGER_BYTES_PER_CU(asic_family) (WAVES_PER_CU * 32) +#define DEBUGGER_BYTES_PER_WAVE(asic_family) 32 struct device_info { enum asic_family_type asic_family; @@ -434,8 +433,11 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) if (node.NumFComputeCores && node.NumSIMDPerCU) { uint32_t ctl_stack_size, wg_data_size; uint32_t cu_num = node.NumFComputeCores / node.NumSIMDPerCU; + uint32_t wave_num = (q->dev_info->asic_family < CHIP_NAVI10) + ? MIN(cu_num * 40, node.NumShaderBanks / node.NumArrays * 512) + : cu_num * 32; - ctl_stack_size = cu_num * CNTL_STACK_BYTES_PER_CU(q->dev_info->asic_family) + 8; + ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(q->dev_info->asic_family) + 8; wg_data_size = cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(q->dev_info->asic_family); q->ctl_stack_size = PAGE_ALIGN_UP(sizeof(HsaUserContextSaveAreaHeader) + ctl_stack_size); @@ -449,7 +451,7 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) } q->debug_memory_size = - ALIGN_UP(cu_num * DEBUGGER_BYTES_PER_CU(q->dev_info->asic_family), DEBUGGER_BYTES_ALIGN); + ALIGN_UP(wave_num * DEBUGGER_BYTES_PER_WAVE(q->dev_info->asic_family), DEBUGGER_BYTES_ALIGN); q->ctx_save_restore_size = q->ctl_stack_size + PAGE_ALIGN_UP(wg_data_size + q->debug_memory_size);