480 Zeilen
17 KiB
C++
480 Zeilen
17 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include <string>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
#include <new>
|
|
|
|
#include "acl.h"
|
|
#define R900_BUILD 1
|
|
#include "SCShadersR800.h"
|
|
#include "r8xx_r9xx_merged__offset.h"
|
|
#include "r8xx_r9xx_merged__typedef.h"
|
|
|
|
namespace gpu {
|
|
|
|
// Number of CALProgramInfoEntry records written by
// NullKernel::r800CreateHwInfo(), which asserts that it produced exactly this
// many. 0x22 covers the fixed register/ABI entries emitted ahead of the
// per-UAV loops; the first SC_R800_MAX_UAV term is the per-UAV return buffer
// size entries.
#define NUM_R800_CS_INFOS                                                              \
    (0x22 + SC_R800_MAX_UAV +                                                          \
     3 +                            /* globalReturnBuffer flag plus numUavs and        \
                                       numGlobalReturnBuffers */                       \
     1 +                            /* extendedCaching flag */                         \
     3 +                            /* globalReturnBuffer sizes for dword, shorts      \
                                       and bytes */                                    \
     3 * SC_R800_MAX_UAV +          /* offsetmap, cached and uncached fetch consts */  \
     2 * SC_R800_MAX_UAV +          /* 64- and 128-bit cached fetch consts */          \
     2 * R800_GLOBAL_RTN_BUF_LAST   /* global return buffer fetch consts and type */)
|
|
|
|
struct Options {
|
|
uint numClauseTemps_;
|
|
uint numGPRs_;
|
|
uint numThreads_;
|
|
uint numStackEntries_;
|
|
uint ldsSize_;
|
|
|
|
Options(CALtarget target) {
|
|
numClauseTemps_ = 4;
|
|
|
|
switch (target) {
|
|
case CAL_TARGET_DEVASTATOR:
|
|
case CAL_TARGET_SCRAPPER:
|
|
case CAL_TARGET_CAYMAN:
|
|
case CAL_TARGET_KAUAI:
|
|
numClauseTemps_ = 0;
|
|
numStackEntries_ = 512;
|
|
numThreads_ = 248;
|
|
break;
|
|
case CAL_TARGET_SUPERSUMO:
|
|
case CAL_TARGET_TURKS:
|
|
case CAL_TARGET_REDWOOD:
|
|
numStackEntries_ = 256;
|
|
numThreads_ = 248;
|
|
break;
|
|
case CAL_TARGET_WRESTLER:
|
|
case CAL_TARGET_SUMO:
|
|
case CAL_TARGET_CAICOS:
|
|
case CAL_TARGET_CEDAR:
|
|
numStackEntries_ = 256;
|
|
numThreads_ = 192;
|
|
break;
|
|
case CAL_TARGET_CYPRESS:
|
|
case CAL_TARGET_BARTS:
|
|
case CAL_TARGET_JUNIPER:
|
|
numStackEntries_ = 512;
|
|
numThreads_ = 248;
|
|
break;
|
|
default:
|
|
numStackEntries_ = 512;
|
|
numThreads_ = 248;
|
|
LogError("Unknown ASIC type");
|
|
}
|
|
|
|
numGPRs_ = 256 - 2 * numClauseTemps_;
|
|
ldsSize_ = 32*1024;
|
|
}
|
|
private:
|
|
Options();
|
|
Options(const Options&);
|
|
Options& operator=(const Options&);
|
|
};
|
|
|
|
// Lookup tables copied verbatim into the program info entries by
// NullKernel::r800CreateHwInfo(). Tables sized SC_R800_MAX_UAV are indexed by
// UAV number; tables sized R800_GLOBAL_RTN_BUF_LAST are indexed by global
// return buffer number.

// Value reported for AMU_ABI_UNCACHED_FETCH_CONST_UAV0 + n
static const uint UncachedFetchConst[SC_R800_MAX_UAV] = {
    154, 155, 156, 157,
    158, 159, 160, 161,
    162, 163, 164, 173
};

// Value reported for AMU_ABI_CACHED_FETCH_CONST_UAV0 + n
static const uint CachedFetchConst[SC_R800_MAX_UAV] = {
    144, 145, 146, 148,
    149, 150, 151, 152,
    0,   0,   0,   153
};

// Value reported for AMU_ABI_GLOBAL_RETURN_FETCH_CONST0 + n
static const uint GlobalReturnFetchConst[R800_GLOBAL_RTN_BUF_LAST] = {
    165, 166, 167, 168,
    169, 170, 171, 172
};

// Value reported for AMU_ABI_GLOBAL_RETURN_BUFFER_TYPE0 + n
static const uint GlobalReturnBufferType[R800_GLOBAL_RTN_BUF_LAST] = {
    AMU_ABI_UAV_FORMAT_TYPELESS,
    AMU_ABI_UAV_FORMAT_FLOAT,
    AMU_ABI_UAV_FORMAT_UNORM,
    AMU_ABI_UAV_FORMAT_SNORM,
    AMU_ABI_UAV_FORMAT_UINT,
    AMU_ABI_UAV_FORMAT_SINT,
    AMU_ABI_UAV_FORMAT_SHORT,
    AMU_ABI_UAV_FORMAT_BYTE
};

// Value reported for AMU_ABI_CACHED_FETCH_CONST64_UAV0 + n
static const uint CachedFetchConst64[SC_R800_MAX_UAV] = {
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 174
};

// Value reported for AMU_ABI_CACHED_FETCH_CONST128_UAV0 + n
static const uint CachedFetchConst128[SC_R800_MAX_UAV] = {
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 175
};
|
|
|
|
bool
|
|
NullKernel::r800CreateHwInfo(const void* shader, AMUabiAddEncoding& encoding)
|
|
{
|
|
CALProgramInfoEntry* newInfos;
|
|
const Options options(nullDev().calTarget());
|
|
uint i = 0;
|
|
uint numShaderEngines = 1;
|
|
if ((nullDev().calTarget() == CAL_TARGET_CAYMAN) ||
|
|
(nullDev().calTarget() == CAL_TARGET_CYPRESS) ||
|
|
(nullDev().calTarget() == CAL_TARGET_BARTS)) {
|
|
numShaderEngines = 2;
|
|
}
|
|
|
|
uint infoCount = NUM_R800_CS_INFOS;
|
|
SC_R800CSHWSHADER* cShader = (SC_R800CSHWSHADER *)shader;
|
|
if (cShader->u32NumThreadPerGroup == 0) {
|
|
return false;
|
|
}
|
|
newInfos = new CALProgramInfoEntry[infoCount];
|
|
encoding.progInfos = newInfos;
|
|
if (encoding.progInfos == 0) {
|
|
infoCount = 0;
|
|
return false;
|
|
}
|
|
memset(newInfos, 0, infoCount * sizeof(CALProgramInfoEntry));
|
|
|
|
newInfos[i].address = mmSQ_PGM_START_LS;
|
|
newInfos[i].value = 0x0;
|
|
i++;
|
|
newInfos[i].address = mmSQ_PGM_RESOURCES_LS;
|
|
cShader->sqPgmResourcesCs.bits.UNCACHED_FIRST_INST = 1;
|
|
cShader->sqPgmResourcesCs.bits.PRIME_CACHE_ENABLE = 1;
|
|
cShader->sqPgmResourcesCs.bits.PRIME_CACHE_ON_CONST = 0;
|
|
newInfos[i].value = cShader->sqPgmResourcesCs.u32All;
|
|
i++;
|
|
newInfos[i].address = mmSQ_PGM_RESOURCES_2_LS;
|
|
newInfos[i].value = cShader->sqPgmResources2Cs.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = mmSPI_THREAD_GROUPING;
|
|
regSPI_THREAD_GROUPING spi_thread_grouping;
|
|
spi_thread_grouping.u32All = 0;
|
|
spi_thread_grouping.bits.PS_GROUPING = 0;
|
|
spi_thread_grouping.bits.VS_GROUPING = 0;
|
|
spi_thread_grouping.bits.ES_GROUPING = 0;
|
|
spi_thread_grouping.bits.GS_GROUPING = 0;
|
|
// dyn_gpr_mgmt if CS_GROUPING = 1.
|
|
spi_thread_grouping.bits.CS_GROUPING = 0;
|
|
newInfos[i].value = spi_thread_grouping.u32All;
|
|
i++;
|
|
|
|
const unsigned int numSharedGPR = cShader->u32NumSharedGprTotal;
|
|
newInfos[i].address = mmSQ_DYN_GPR_CNTL_PS_FLUSH_REQ;
|
|
regSQ_DYN_GPR_CNTL_PS_FLUSH_REQ sq_dyn_gpr_cntl_ps_flush_req;
|
|
sq_dyn_gpr_cntl_ps_flush_req.u32All = 0;
|
|
sq_dyn_gpr_cntl_ps_flush_req.bits.RING0_OFFSET = numSharedGPR;
|
|
newInfos[i].value = sq_dyn_gpr_cntl_ps_flush_req.u32All;
|
|
i++;
|
|
|
|
const unsigned int numClauseTemps = options.numClauseTemps_;
|
|
const unsigned int MaxNumGPRsAvail = options.numGPRs_;
|
|
newInfos[i].address = mmSQ_GPR_RESOURCE_MGMT_1;
|
|
regSQ_GPR_RESOURCE_MGMT_1 sq_gpr_resource_mgmt_1;
|
|
sq_gpr_resource_mgmt_1.u32All = 0;
|
|
sq_gpr_resource_mgmt_1.bits.NUM_CLAUSE_TEMP_GPRS = numClauseTemps;
|
|
newInfos[i].value = sq_gpr_resource_mgmt_1.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = mmSQ_GPR_RESOURCE_MGMT_3__EG;
|
|
regSQ_GPR_RESOURCE_MGMT_3__EG sq_gpr_resource_mgmt_3;
|
|
sq_gpr_resource_mgmt_3.u32All = 0;
|
|
{
|
|
const unsigned int numWavefrontPerSIMD = 1 ; // ?? cShader->u32NumWavefrontPerSIMD;
|
|
if ((cShader->u32NumSharedGprUser != cShader->u32NumSharedGprTotal)) // cShader->bIsMaxNumWavePerSIMD)
|
|
{
|
|
// if running with a barrier, need to limit the number of wavefronts on a SIMD.
|
|
// force max wavefronts run on a simd by adjusting the num_es_gprs pool that all es programs can
|
|
// allocate from. (# of gprs the program uses * numWavefrontsPerSIMD)
|
|
sq_gpr_resource_mgmt_3.bits.NUM_LS_GPRS = cShader->sqPgmResourcesCs.bits.NUM_GPRS * numWavefrontPerSIMD;
|
|
}
|
|
else
|
|
{
|
|
sq_gpr_resource_mgmt_3.bits.NUM_LS_GPRS = MaxNumGPRsAvail - numSharedGPR;
|
|
}
|
|
}
|
|
newInfos[i].value = sq_gpr_resource_mgmt_3.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = mmSPI_GPR_MGMT;
|
|
regSPI_GPR_MGMT spi_gpr_mgmt;
|
|
spi_gpr_mgmt.u32All = 0;
|
|
{
|
|
const unsigned int numWavefrontPerSIMD = 1 ; // ?? cShader->u32NumWavefrontPerSIMD;
|
|
if ((cShader->u32NumSharedGprUser != cShader->u32NumSharedGprTotal)) // cShader->bIsMaxNumWavePerSIMD)
|
|
{
|
|
// if running with a barrier, need to limit the number of wavefronts on a SIMD.
|
|
// force max wavefronts run on a simd by adjusting the num_es_gprs pool that all es programs can
|
|
// allocate from. (# of gprs the program uses * numWavefrontsPerSIMD)
|
|
spi_gpr_mgmt.bits.NUM_LS_GPRS = (cShader->sqPgmResourcesCs.bits.NUM_GPRS * numWavefrontPerSIMD) >> 3;
|
|
}
|
|
else
|
|
{
|
|
spi_gpr_mgmt.bits.NUM_LS_GPRS = (MaxNumGPRsAvail - numSharedGPR) >> 3;
|
|
}
|
|
}
|
|
newInfos[i].value = spi_gpr_mgmt.u32All;
|
|
i++;
|
|
|
|
|
|
newInfos[i].address = mmSPI_WAVE_MGMT_1;
|
|
regSPI_WAVE_MGMT_1 spi_wave_mgmt_1;
|
|
spi_wave_mgmt_1.u32All = 0;
|
|
newInfos[i].value = spi_wave_mgmt_1.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = mmSPI_WAVE_MGMT_2;
|
|
regSPI_WAVE_MGMT_2 spi_wave_mgmt_2;
|
|
spi_wave_mgmt_2.u32All = 0;
|
|
spi_wave_mgmt_2.bits.NUM_CS_WAVES_ONE_RING = (options.numThreads_) >> 3;
|
|
newInfos[i].value = spi_wave_mgmt_2.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = mmSQ_THREAD_RESOURCE_MGMT__EG;
|
|
regSQ_THREAD_RESOURCE_MGMT__EG sq_thread_resource_mgmt;
|
|
sq_thread_resource_mgmt.u32All = 0;
|
|
sq_thread_resource_mgmt.bits.NUM_PS_THREADS = 0;
|
|
sq_thread_resource_mgmt.bits.NUM_VS_THREADS = 0;
|
|
sq_thread_resource_mgmt.bits.NUM_GS_THREADS = 0;
|
|
sq_thread_resource_mgmt.bits.NUM_ES_THREADS = 0;
|
|
newInfos[i].value = sq_thread_resource_mgmt.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = mmSQ_THREAD_RESOURCE_MGMT_2__EG;
|
|
regSQ_THREAD_RESOURCE_MGMT_2__EG sq_thread_resource_mgmt_2;
|
|
sq_thread_resource_mgmt_2.u32All = 0;
|
|
sq_thread_resource_mgmt_2.bits.NUM_HS_THREADS = 0;
|
|
sq_thread_resource_mgmt_2.bits.NUM_LS_THREADS = options.numThreads_;
|
|
newInfos[i].value = sq_thread_resource_mgmt_2.u32All;
|
|
i++;
|
|
|
|
regSPI_COMPUTE_INPUT_CNTL spi_dompute_input_cntl;
|
|
spi_dompute_input_cntl.u32All = 0;
|
|
spi_dompute_input_cntl.bits.DISABLE_INDEX_PACK = 1;
|
|
spi_dompute_input_cntl.bits.TID_IN_GROUP_ENA = 1;
|
|
spi_dompute_input_cntl.bits.TGID_ENA = 1;
|
|
newInfos[i].address = mmSPI_COMPUTE_INPUT_CNTL;
|
|
newInfos[i].value = spi_dompute_input_cntl.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = mmSQ_LDS_ALLOC;
|
|
newInfos[i].value = cShader->sqLdsAllocCs.u32All;
|
|
i++;
|
|
|
|
//This is information passed from SC to GSL, there is no valid address, so make up one.
|
|
newInfos[i].address = AMU_ABI_CS_MAX_SCRATCH_REGS;
|
|
newInfos[i].value = cShader->MaxScratchRegsNeeded;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_CS_NUM_SHARED_GPR_USER;
|
|
newInfos[i].value = cShader->u32NumSharedGprUser;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_CS_NUM_SHARED_GPR_TOTAL;
|
|
newInfos[i].value = cShader->u32NumSharedGprTotal;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP;
|
|
newInfos[i].value = cShader->u32NumThreadPerGroup;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X;
|
|
newInfos[i].value = cShader->u32NumThreadPerGroup_x;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y;
|
|
newInfos[i].value = cShader->u32NumThreadPerGroup_y;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z;
|
|
newInfos[i].value = cShader->u32NumThreadPerGroup_z;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_TOTAL_NUM_THREAD_GROUP;
|
|
newInfos[i].value = cShader->u32TotalNumThreadGroup;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD;
|
|
newInfos[i].value = 1;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_IS_MAX_NUM_WAVE_PER_SIMD;
|
|
newInfos[i].value = 0; // ??
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SET_BUFFER_FOR_NUM_GROUP;
|
|
newInfos[i].value = cShader->bSetBufferForNumGroup;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_RAT_OP_IS_USED;
|
|
newInfos[i].value = cShader->u32RatOpIsUsed;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_RAT_ATOMIC_OP_IS_USED;
|
|
newInfos[i].value = cShader->u32RatAtomicOpIsUsed;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE;
|
|
newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_GPR_AVAIL;
|
|
newInfos[i].value = options.numGPRs_;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_GPR_USED;
|
|
newInfos[i].value = cShader->sqPgmResourcesCs.bits.NUM_GPRS;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL;
|
|
newInfos[i].value = options.ldsSize_;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_LDS_SIZE_USED;
|
|
newInfos[i].value = cShader->sqLdsAllocCs.bits.SIZE;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_STACK_SIZE_AVAIL;
|
|
newInfos[i].value = options.numStackEntries_;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_STACK_SIZE_USED;
|
|
newInfos[i].value = cShader->sqPgmResourcesCs.bits.STACK_SIZE;
|
|
i++;
|
|
|
|
for (unsigned int j = 0;j <SC_R800_MAX_UAV; j++)
|
|
{
|
|
unsigned int bufferSize = cShader->scUavRtnBufInfoTbl[j].stride;
|
|
|
|
bufferSize *= 4; // convert from DWORDS to bytes
|
|
|
|
//
|
|
// multiply by the maximum number of threads in flight at one time
|
|
//
|
|
// 256 waves * 64 threads/wave * 2 shader engines (for 870)
|
|
//
|
|
bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave
|
|
bufferSize *= 256 * 4; // maximum number of waves
|
|
|
|
bufferSize *= numShaderEngines;
|
|
|
|
newInfos[i].address = AMU_ABI_SET_BUFFER_FOR_UAV_RET_BUFFER0 + j;
|
|
newInfos[i].value = bufferSize;
|
|
i++;
|
|
}
|
|
newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER;
|
|
newInfos[i].value = true;
|
|
i++;
|
|
// Always use extended caching with global return buffer
|
|
newInfos[i].address = AMU_ABI_EXTENDED_CACHING;
|
|
newInfos[i].value = true;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_GLOBAL_UAV;
|
|
newInfos[i].value = SC_R800_MAX_UAV;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_GLOBAL_RETURN_BUFFER;
|
|
newInfos[i].value = R800_GLOBAL_RTN_BUF_LAST;
|
|
i++;
|
|
{
|
|
unsigned int bufferSize = cShader->u32GlobalRtnBufSlot;
|
|
|
|
bufferSize *= 4; // convert from DWORDS to bytes
|
|
|
|
//
|
|
// multiply by the maximum number of threads in flight at one time
|
|
//
|
|
// 256 waves * 64 threads/wave * 2 shader engines (for 870)
|
|
//
|
|
bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave
|
|
bufferSize *= 256 * 4; // maximum number of waves
|
|
|
|
bufferSize *= numShaderEngines;
|
|
|
|
newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_SIZE;
|
|
newInfos[i].value = bufferSize;
|
|
i++;
|
|
}
|
|
{
|
|
unsigned int bufferSize = cShader->u32GlobalRtnBufSlotShort;
|
|
|
|
bufferSize *= 4; // convert from DWORDS to bytes
|
|
|
|
//
|
|
// multiply by the maximum number of threads in flight at one time
|
|
//
|
|
// 256 waves * 64 threads/wave * 2 shader engines (for 870)
|
|
//
|
|
bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave
|
|
bufferSize *= 256 * 4; // maximum number of waves
|
|
|
|
bufferSize *= numShaderEngines;
|
|
|
|
newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_SIZE_SHORT;
|
|
newInfos[i].value = bufferSize;
|
|
i++;
|
|
}
|
|
{
|
|
unsigned int bufferSize = cShader->u32GlobalRtnBufSlotByte;
|
|
|
|
bufferSize *= 4; // convert from DWORDS to bytes
|
|
|
|
//
|
|
// multiply by the maximum number of threads in flight at one time
|
|
//
|
|
// 256 waves * 64 threads/wave * 2 shader engines (for 870)
|
|
//
|
|
bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave
|
|
bufferSize *= 256 * 4; // maximum number of waves
|
|
|
|
bufferSize *= numShaderEngines;
|
|
|
|
newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_SIZE_BYTE;
|
|
newInfos[i].value = bufferSize;
|
|
i++;
|
|
}
|
|
for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++)
|
|
{
|
|
newInfos[i].address = AMU_ABI_OFFSET_TO_UAV0+j;
|
|
newInfos[i].value = j;
|
|
i++;
|
|
}
|
|
for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++)
|
|
{
|
|
// Set up UAV->fetch constant mapping for uncached
|
|
newInfos[i].address = AMU_ABI_UNCACHED_FETCH_CONST_UAV0+j;
|
|
newInfos[i].value = UncachedFetchConst[j];
|
|
i++;
|
|
}
|
|
for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++)
|
|
{
|
|
newInfos[i].address = AMU_ABI_CACHED_FETCH_CONST_UAV0+j;
|
|
newInfos[i].value = CachedFetchConst[j];
|
|
i++;
|
|
}
|
|
for (unsigned int j = 0; j < R800_GLOBAL_RTN_BUF_LAST; j++)
|
|
{
|
|
newInfos[i].address = AMU_ABI_GLOBAL_RETURN_FETCH_CONST0+j;
|
|
newInfos[i].value = GlobalReturnFetchConst[j];
|
|
i++;
|
|
}
|
|
for (unsigned int j = 0; j < R800_GLOBAL_RTN_BUF_LAST; j++)
|
|
{
|
|
newInfos[i].address = AMU_ABI_GLOBAL_RETURN_BUFFER_TYPE0+j;
|
|
newInfos[i].value = GlobalReturnBufferType[j];
|
|
i++;
|
|
}
|
|
for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++)
|
|
{
|
|
newInfos[i].address = AMU_ABI_CACHED_FETCH_CONST64_UAV0+j;
|
|
newInfos[i].value = CachedFetchConst64[j];
|
|
i++;
|
|
}
|
|
for (unsigned int j = 0; j < SC_R800_MAX_UAV; j++)
|
|
{
|
|
newInfos[i].address = AMU_ABI_CACHED_FETCH_CONST128_UAV0+j;
|
|
newInfos[i].value = CachedFetchConst128[j];
|
|
i++;
|
|
}
|
|
|
|
assert(i == infoCount);
|
|
encoding.progInfosCount = infoCount;
|
|
|
|
encoding.uavMask.mask[0] = cShader->u32RatOpIsUsed;
|
|
encoding.textData = HWSHADER_Get(cShader, hShaderMemHandle);
|
|
encoding.textSize = cShader->CodeLenInByte;
|
|
instructionCnt_ = encoding.textSize / sizeof(uint32_t);
|
|
encoding.scratchRegisterCount = cShader->MaxScratchRegsNeeded;
|
|
|
|
uint bufferSize = 0;
|
|
bufferSize = cShader->u32GlobalRtnBufSlot +
|
|
cShader->u32GlobalRtnBufSlotShort + cShader->u32GlobalRtnBufSlotByte;
|
|
bufferSize *= 4; // convert from DWORDS to bytes
|
|
|
|
//
|
|
// multiply by the maximum number of threads in flight at one time
|
|
//
|
|
// 256 waves * 64 threads/wave * 2 shader engines (for 870)
|
|
//
|
|
bufferSize *= nullDev().hwInfo()->simdWidth_ * 4; // threads/wave
|
|
bufferSize *= 256 * 4; // maximum number of waves
|
|
|
|
bufferSize *= numShaderEngines;
|
|
encoding.UAVReturnBufferTotalSize = bufferSize;
|
|
|
|
return true;
|
|
}
|
|
|
|
} // namespace gpu
|
|
|