// // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. // #include "device/gpu/gpukernel.hpp" #include #include #include #include #include #include "acl.h" #define R900_BUILD 1 #include "SCShadersR800.h" #include "r8xx_r9xx_merged__offset.h" #include "r8xx_r9xx_merged__typedef.h" namespace gpu { #define NUM_R800_CS_INFOS (0x22+SC_R800_MAX_UAV+ \ 3+ /* globalReturnBuffer flag plus numUavs and numGlobalReturnBuffers */ \ 1+ /* extendedCaching flag */ \ 3+ /* globalReturnBuffer sizes for dword, shorts and bytes */ \ 3*SC_R800_MAX_UAV+ /* offsetmap, cached and uncached fetch consts */ \ 2*SC_R800_MAX_UAV+ /* 64- and 128-bit cached fetch consts */ \ 2*R800_GLOBAL_RTN_BUF_LAST /* global return buffer fetch consts and type */ ) struct Options { uint numClauseTemps_; uint numGPRs_; uint numThreads_; uint numStackEntries_; uint ldsSize_; Options(CALtarget target) { numClauseTemps_ = 4; switch (target) { case CAL_TARGET_DEVASTATOR: case CAL_TARGET_SCRAPPER: case CAL_TARGET_CAYMAN: case CAL_TARGET_KAUAI: numClauseTemps_ = 0; numStackEntries_ = 512; numThreads_ = 248; break; case CAL_TARGET_SUPERSUMO: case CAL_TARGET_TURKS: case CAL_TARGET_REDWOOD: numStackEntries_ = 256; numThreads_ = 248; break; case CAL_TARGET_WRESTLER: case CAL_TARGET_SUMO: case CAL_TARGET_CAICOS: case CAL_TARGET_CEDAR: numStackEntries_ = 256; numThreads_ = 192; break; case CAL_TARGET_CYPRESS: case CAL_TARGET_BARTS: case CAL_TARGET_JUNIPER: numStackEntries_ = 512; numThreads_ = 248; break; default: numStackEntries_ = 512; numThreads_ = 248; LogError("Unknown ASIC type"); } numGPRs_ = 256 - 2 * numClauseTemps_; ldsSize_ = 32*1024; } private: Options(); Options(const Options&); Options& operator=(const Options&); }; static const uint UncachedFetchConst[SC_R800_MAX_UAV] = { 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 173 }; static const uint CachedFetchConst[SC_R800_MAX_UAV] = { 144, 145, 146, 148, 149, 150, 151, 152, 0, 0, 0, 153 }; static const uint GlobalReturnFetchConst[R800_GLOBAL_RTN_BUF_LAST] = { 165, 166, 167, 168, 169, 170, 171, 172 }; static const uint GlobalReturnBufferType[R800_GLOBAL_RTN_BUF_LAST] = { AMU_ABI_UAV_FORMAT_TYPELESS, AMU_ABI_UAV_FORMAT_FLOAT, AMU_ABI_UAV_FORMAT_UNORM, AMU_ABI_UAV_FORMAT_SNORM, AMU_ABI_UAV_FORMAT_UINT, AMU_ABI_UAV_FORMAT_SINT, AMU_ABI_UAV_FORMAT_SHORT, AMU_ABI_UAV_FORMAT_BYTE }; static const uint CachedFetchConst64[SC_R800_MAX_UAV] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 174 }; static const uint CachedFetchConst128[SC_R800_MAX_UAV] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 175 }; bool NullKernel::r800CreateHwInfo(const void* shader, AMUabiAddEncoding& encoding) { CALProgramInfoEntry* newInfos; const Options options(nullDev().calTarget()); uint i = 0; uint numShaderEngines = 1; if ((nullDev().calTarget() == CAL_TARGET_CAYMAN) || (nullDev().calTarget() == CAL_TARGET_CYPRESS) || (nullDev().calTarget() == CAL_TARGET_BARTS)) { numShaderEngines = 2; } uint infoCount = NUM_R800_CS_INFOS; SC_R800CSHWSHADER* cShader = (SC_R800CSHWSHADER *)shader; if (cShader->u32NumThreadPerGroup == 0) { return false; } newInfos = new CALProgramInfoEntry[infoCount]; encoding.progInfos = newInfos; if (encoding.progInfos == 0) { infoCount = 0; return false; } memset(newInfos, 0, infoCount * sizeof(CALProgramInfoEntry)); newInfos[i].address = mmSQ_PGM_START_LS; newInfos[i].value = 0x0; i++; newInfos[i].address = mmSQ_PGM_RESOURCES_LS; cShader->sqPgmResourcesCs.bits.UNCACHED_FIRST_INST = 1; cShader->sqPgmResourcesCs.bits.PRIME_CACHE_ENABLE = 1; cShader->sqPgmResourcesCs.bits.PRIME_CACHE_ON_CONST = 0; newInfos[i].value = cShader->sqPgmResourcesCs.u32All; i++; newInfos[i].address = mmSQ_PGM_RESOURCES_2_LS; newInfos[i].value = cShader->sqPgmResources2Cs.u32All; i++; newInfos[i].address = mmSPI_THREAD_GROUPING; regSPI_THREAD_GROUPING spi_thread_grouping; spi_thread_grouping.u32All = 0; spi_thread_grouping.bits.PS_GROUPING = 0; spi_thread_grouping.bits.VS_GROUPING = 0; spi_thread_grouping.bits.ES_GROUPING = 0; spi_thread_grouping.bits.GS_GROUPING = 0; // dyn_gpr_mgmt if CS_GROUPING = 1. spi_thread_grouping.bits.CS_GROUPING = 0; newInfos[i].value = spi_thread_grouping.u32All; i++; const unsigned int numSharedGPR = cShader->u32NumSharedGprTotal; newInfos[i].address = mmSQ_DYN_GPR_CNTL_PS_FLUSH_REQ; regSQ_DYN_GPR_CNTL_PS_FLUSH_REQ sq_dyn_gpr_cntl_ps_flush_req; sq_dyn_gpr_cntl_ps_flush_req.u32All = 0; sq_dyn_gpr_cntl_ps_flush_req.bits.RING0_OFFSET = numSharedGPR; newInfos[i].value = sq_dyn_gpr_cntl_ps_flush_req.u32All; i++; const unsigned int numClauseTemps = options.numClauseTemps_; const unsigned int MaxNumGPRsAvail = options.numGPRs_; newInfos[i].address = mmSQ_GPR_RESOURCE_MGMT_1; regSQ_GPR_RESOURCE_MGMT_1 sq_gpr_resource_mgmt_1; sq_gpr_resource_mgmt_1.u32All = 0; sq_gpr_resource_mgmt_1.bits.NUM_CLAUSE_TEMP_GPRS = numClauseTemps; newInfos[i].value = sq_gpr_resource_mgmt_1.u32All; i++; newInfos[i].address = mmSQ_GPR_RESOURCE_MGMT_3__EG; regSQ_GPR_RESOURCE_MGMT_3__EG sq_gpr_resource_mgmt_3; sq_gpr_resource_mgmt_3.u32All = 0; { const unsigned int numWavefrontPerSIMD = 1 ; // ?? cShader->u32NumWavefrontPerSIMD; if ((cShader->u32NumSharedGprUser != cShader->u32NumSharedGprTotal)) // cShader->bIsMaxNumWavePerSIMD) { // if running with a barrier, need to limit the number of wavefronts on a SIMD. // force max wavefronts run on a simd by adjusting the num_es_gprs pool that all es programs can // allocate from. (# of gprs the program uses * numWavefrontsPerSIMD) sq_gpr_resource_mgmt_3.bits.NUM_LS_GPRS = cShader->sqPgmResourcesCs.bits.NUM_GPRS * numWavefrontPerSIMD; } else { sq_gpr_resource_mgmt_3.bits.NUM_LS_GPRS = MaxNumGPRsAvail - numSharedGPR; } } newInfos[i].value = sq_gpr_resource_mgmt_3.u32All; i++; newInfos[i].address = mmSPI_GPR_MGMT; regSPI_GPR_MGMT spi_gpr_mgmt; spi_gpr_mgmt.u32All = 0; { const unsigned int numWavefrontPerSIMD = 1 ; // ?? cShader->u32NumWavefrontPerSIMD; if ((cShader->u32NumSharedGprUser != cShader->u32NumSharedGprTotal)) // cShader->bIsMaxNumWavePerSIMD) { // if running with a barrier, need to limit the number of wavefronts on a SIMD. // force max wavefronts run on a simd by adjusting the num_es_gprs pool that all es programs can // allocate from. (# of gprs the program uses * numWavefrontsPerSIMD) spi_gpr_mgmt.bits.NUM_LS_GPRS = (cShader->sqPgmResourcesCs.bits.NUM_GPRS * numWavefrontPerSIMD) >> 3; } else { spi_gpr_mgmt.bits.NUM_LS_GPRS = (MaxNumGPRsAvail - numSharedGPR) >> 3; } } newInfos[i].value = spi_gpr_mgmt.u32All; i++; newInfos[i].address = mmSPI_WAVE_MGMT_1; regSPI_WAVE_MGMT_1 spi_wave_mgmt_1; spi_wave_mgmt_1.u32All = 0; newInfos[i].value = spi_wave_mgmt_1.u32All; i++; newInfos[i].address = mmSPI_WAVE_MGMT_2; regSPI_WAVE_MGMT_2 spi_wave_mgmt_2; spi_wave_mgmt_2.u32All = 0; spi_wave_mgmt_2.bits.NUM_CS_WAVES_ONE_RING = (options.numThreads_) >> 3; newInfos[i].value = spi_wave_mgmt_2.u32All; i++; newInfos[i].address = mmSQ_THREAD_RESOURCE_MGMT__EG; regSQ_THREAD_RESOURCE_MGMT__EG sq_thread_resource_mgmt; sq_thread_resource_mgmt.u32All = 0; sq_thread_resource_mgmt.bits.NUM_PS_THREADS = 0; sq_thread_resource_mgmt.bits.NUM_VS_THREADS = 0; sq_thread_resource_mgmt.bits.NUM_GS_THREADS = 0; sq_thread_resource_mgmt.bits.NUM_ES_THREADS = 0; newInfos[i].value = sq_thread_resource_mgmt.u32All; i++; newInfos[i].address = mmSQ_THREAD_RESOURCE_MGMT_2__EG; regSQ_THREAD_RESOURCE_MGMT_2__EG sq_thread_resource_mgmt_2; sq_thread_resource_mgmt_2.u32All = 0; sq_thread_resource_mgmt_2.bits.NUM_HS_THREADS = 0; sq_thread_resource_mgmt_2.bits.NUM_LS_THREADS = options.numThreads_; newInfos[i].value = sq_thread_resource_mgmt_2.u32All; i++; regSPI_COMPUTE_INPUT_CNTL spi_dompute_input_cntl; spi_dompute_input_cntl.u32All = 0; spi_dompute_input_cntl.bits.DISABLE_INDEX_PACK = 1; spi_dompute_input_cntl.bits.TID_IN_GROUP_ENA = 1; spi_dompute_input_cntl.bits.TGID_ENA = 1; newInfos[i].address = mmSPI_COMPUTE_INPUT_CNTL; newInfos[i].value = spi_dompute_input_cntl.u32All; i++; newInfos[i].address = mmSQ_LDS_ALLOC; newInfos[i].value = cShader->sqLdsAllocCs.u32All; i++; //This is information passed from SC to GSL, there is no valid address, so make up one. newInfos[i].address = AMU_ABI_CS_MAX_SCRATCH_REGS; newInfos[i].value = cShader->MaxScratchRegsNeeded; i++; newInfos[i].address = AMU_ABI_CS_NUM_SHARED_GPR_USER; newInfos[i].value = cShader->u32NumSharedGprUser; i++; newInfos[i].address = AMU_ABI_CS_NUM_SHARED_GPR_TOTAL; newInfos[i].value = cShader->u32NumSharedGprTotal; i++; newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP; newInfos[i].value = cShader->u32NumThreadPerGroup; i++; newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X; newInfos[i].value = cShader->u32NumThreadPerGroup_x; i++; newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y; newInfos[i].value = cShader->u32NumThreadPerGroup_y; i++; newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z; newInfos[i].value = cShader->u32NumThreadPerGroup_z; i++; newInfos[i].address = AMU_ABI_TOTAL_NUM_THREAD_GROUP; newInfos[i].value = cShader->u32TotalNumThreadGroup; i++; newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD; newInfos[i].value = 1; i++; newInfos[i].address = AMU_ABI_IS_MAX_NUM_WAVE_PER_SIMD; newInfos[i].value = 0; // ?? i++; newInfos[i].address = AMU_ABI_SET_BUFFER_FOR_NUM_GROUP; newInfos[i].value = cShader->bSetBufferForNumGroup; i++; newInfos[i].address = AMU_ABI_RAT_OP_IS_USED; newInfos[i].value = cShader->u32RatOpIsUsed; i++; newInfos[i].address = AMU_ABI_RAT_ATOMIC_OP_IS_USED; newInfos[i].value = cShader->u32RatAtomicOpIsUsed; i++; newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE; newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4; i++; newInfos[i].address = AMU_ABI_NUM_GPR_AVAIL; newInfos[i].value = options.numGPRs_; i++; newInfos[i].address = AMU_ABI_NUM_GPR_USED; newInfos[i].value = cShader->sqPgmResourcesCs.bits.NUM_GPRS; i++; newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL; newInfos[i].value = options.ldsSize_; i++; newInfos[i].address = AMU_ABI_LDS_SIZE_USED; newInfos[i].value = cShader->sqLdsAllocCs.bits.SIZE; i++; newInfos[i].address = AMU_ABI_STACK_SIZE_AVAIL; newInfos[i].value = options.numStackEntries_; i++; newInfos[i].address = AMU_ABI_STACK_SIZE_USED; newInfos[i].value = cShader->sqPgmResourcesCs.bits.STACK_SIZE; i++; for (unsigned int j = 0;j scUavRtnBufInfoTbl[j].stride; bufferSize *= 4; // convert from DWORDS to bytes // // multiply by the maximum number of threads in flight at one time // // 256 waves * 64 threads/wave * 2 shader engines (for 870) // bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave bufferSize *= 256 * 4; // maximum number of waves bufferSize *= numShaderEngines; newInfos[i].address = AMU_ABI_SET_BUFFER_FOR_UAV_RET_BUFFER0 + j; newInfos[i].value = bufferSize; i++; } newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER; newInfos[i].value = true; i++; // Always use extended caching with global return buffer newInfos[i].address = AMU_ABI_EXTENDED_CACHING; newInfos[i].value = true; i++; newInfos[i].address = AMU_ABI_NUM_GLOBAL_UAV; newInfos[i].value = SC_R800_MAX_UAV; i++; newInfos[i].address = AMU_ABI_NUM_GLOBAL_RETURN_BUFFER; newInfos[i].value = R800_GLOBAL_RTN_BUF_LAST; i++; { unsigned int bufferSize = cShader->u32GlobalRtnBufSlot; bufferSize *= 4; // convert from DWORDS to bytes // // multiply by the maximum number of threads in flight at one time // // 256 waves * 64 threads/wave * 2 shader engines (for 870) // bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave bufferSize *= 256 * 4; // maximum number of waves bufferSize *= numShaderEngines; newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_SIZE; newInfos[i].value = bufferSize; i++; } { unsigned int bufferSize = cShader->u32GlobalRtnBufSlotShort; bufferSize *= 4; // convert from DWORDS to bytes // // multiply by the maximum number of threads in flight at one time // // 256 waves * 64 threads/wave * 2 shader engines (for 870) // bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave bufferSize *= 256 * 4; // maximum number of waves bufferSize *= numShaderEngines; newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_SIZE_SHORT; newInfos[i].value = bufferSize; i++; } { unsigned int bufferSize = cShader->u32GlobalRtnBufSlotByte; bufferSize *= 4; // convert from DWORDS to bytes // // multiply by the maximum number of threads in flight at one time // // 256 waves * 64 threads/wave * 2 shader engines (for 870) // bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave bufferSize *= 256 * 4; // maximum number of waves bufferSize *= numShaderEngines; newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_SIZE_BYTE; newInfos[i].value = bufferSize; i++; } for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++) { newInfos[i].address = AMU_ABI_OFFSET_TO_UAV0+j; newInfos[i].value = j; i++; } for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++) { // Set up UAV->fetch constant mapping for uncached newInfos[i].address = AMU_ABI_UNCACHED_FETCH_CONST_UAV0+j; newInfos[i].value = UncachedFetchConst[j]; i++; } for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++) { newInfos[i].address = AMU_ABI_CACHED_FETCH_CONST_UAV0+j; newInfos[i].value = CachedFetchConst[j]; i++; } for (unsigned int j = 0; j < R800_GLOBAL_RTN_BUF_LAST; j++) { newInfos[i].address = AMU_ABI_GLOBAL_RETURN_FETCH_CONST0+j; newInfos[i].value = GlobalReturnFetchConst[j]; i++; } for (unsigned int j = 0; j < R800_GLOBAL_RTN_BUF_LAST; j++) { newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_TYPE0+j; newInfos[i].value = GlobalReturnBufferType[j]; i++; } for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++) { newInfos[i].address = AMU_ABI_CACHED_FETCH_CONST64_UAV0+j; newInfos[i].value = CachedFetchConst64[j]; i++; } for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++) { newInfos[i].address = AMU_ABI_CACHED_FETCH_CONST128_UAV0+j; newInfos[i].value = CachedFetchConst128[j]; i++; } assert(i == infoCount); encoding.progInfosCount = infoCount; encoding.uavMask.mask[0] = cShader->u32RatOpIsUsed; encoding.textData = HWSHADER_Get(cShader, hShaderMemHandle); encoding.textSize = cShader->CodeLenInByte; instructionCnt_ = encoding.textSize / sizeof(uint32_t); encoding.scratchRegisterCount = cShader->MaxScratchRegsNeeded; uint bufferSize = 0; bufferSize = cShader->u32GlobalRtnBufSlot + cShader->u32GlobalRtnBufSlotShort + cShader->u32GlobalRtnBufSlotByte; bufferSize *= 4; // convert from DWORDS to bytes // // multiply by the maximum number of threads in flight at one time // // 256 waves * 64 threads/wave * 2 shader engines (for 870) // bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave bufferSize *= 256 * 4; // maximum number of waves bufferSize *= numShaderEngines; encoding.UAVReturnBufferTotalSize = bufferSize; return true; } } // namespace gpu