647aba6ed2
ECR #399840 - re-checkin of CL1109955 with the fix of OpenCL sanity check timeout (hw debug flag initialization) Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.h#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#174 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#238 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugger.hpp#3 add ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#3 add ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#3 add ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#490 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#137 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#275 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#106 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#200 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#30 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#297 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#346 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#124 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#69 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#3 add ... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#223 edit
212 строки
7.7 KiB
C++
212 строки
7.7 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpudefs.hpp"
|
|
#include "device/gpu/gpuprogram.hpp"
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include "acl.h"
|
|
#include "SCShadersSi.h"
|
|
#include "si_ci_vi_merged_offset.h"
|
|
#include "si_ci_vi_merged_registers.h"
|
|
#include <string>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
|
|
namespace gpu {
|
|
|
|
bool
|
|
NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding)
|
|
{
|
|
static const uint NumSiCsInfos = (70 + 5 + 1 + 32 + 6);
|
|
CALProgramInfoEntry* newInfos;
|
|
uint i = 0;
|
|
uint infoCount = NumSiCsInfos;
|
|
const SC_SI_HWSHADER_CS* cShader = reinterpret_cast<const SC_SI_HWSHADER_CS*>(shader);
|
|
newInfos = new CALProgramInfoEntry[infoCount];
|
|
encoding.progInfos = newInfos;
|
|
if (encoding.progInfos == 0) {
|
|
infoCount = 0;
|
|
return false;
|
|
}
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENT_COUNT;
|
|
newInfos[i].value = cShader->common.userElementCount;
|
|
i++;
|
|
for (unsigned int j = 0; j < cShader->common.userElementCount; j++) {
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD0 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].dataClass;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD1 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].apiSlot;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD2 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].startUserReg;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD3 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].userRegCount;
|
|
i++;
|
|
}
|
|
|
|
newInfos[i].address = AMU_ABI_SI_NUM_VGPRS;
|
|
newInfos[i].value = cShader->common.numVgprs;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_NUM_SGPRS;
|
|
newInfos[i].value = cShader->common.numSgprs;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_NUM_SGPRS_AVAIL;
|
|
newInfos[i].value = SI_sgprs_avail; //512;//options.NumSGPRsAvailable;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_NUM_VGPRS_AVAIL;
|
|
newInfos[i].value = SI_vgprs_avail;//options.NumVGPRsAvailable;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_SI_FLOAT_MODE;
|
|
newInfos[i].value = cShader->common.floatMode;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_IEEE_MODE;
|
|
newInfos[i].value = cShader->common.bIeeeMode;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_SI_SCRATCH_SIZE;
|
|
newInfos[i].value = cShader->common.scratchSize;;
|
|
i++;
|
|
|
|
newInfos[i].address = mmCOMPUTE_PGM_RSRC2;
|
|
newInfos[i].value = cShader->computePgmRsrc2.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X;
|
|
newInfos[i].value = cShader->numThreadX;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y;
|
|
newInfos[i].value = cShader->numThreadY;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z;
|
|
newInfos[i].value = cShader->numThreadZ;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_ORDERED_APPEND_ENABLE;
|
|
newInfos[i].value = cShader->bOrderedAppendEnable;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_RAT_OP_IS_USED;
|
|
newInfos[i].value = cShader->common.uavResourceUsage[0];
|
|
i++;
|
|
|
|
for (unsigned int j = 0; j < ((SC_MAX_UAV + 31) / 32); j++) {
|
|
newInfos[i].address = AMU_ABI_UAV_RESOURCE_MASK_0 + j;
|
|
newInfos[i].value = cShader->common.uavResourceUsage[j];
|
|
i++;
|
|
}
|
|
|
|
newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD; // Setting the same as for scWrapR800Info
|
|
newInfos[i].value = 1;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE;
|
|
newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4; //options.WavefrontSize;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL;
|
|
newInfos[i].value = SI_ldssize_avail; //options.LDSSize;
|
|
i++;
|
|
|
|
COMPUTE_PGM_RSRC2 computePgmRsrc2;
|
|
computePgmRsrc2.u32All = cShader->computePgmRsrc2.u32All;
|
|
|
|
newInfos[i].address = AMU_ABI_LDS_SIZE_USED;
|
|
newInfos[i].value = 64 * 4 * computePgmRsrc2.bits.LDS_SIZE;
|
|
i++;
|
|
|
|
infoCount = i;
|
|
assert((i + 4 * (16 - cShader->common.userElementCount)) == NumSiCsInfos);
|
|
encoding.progInfosCount = infoCount;
|
|
|
|
CALUavMask uavMask;
|
|
memcpy(uavMask.mask, cShader->common.uavResourceUsage, sizeof(CALUavMask));
|
|
encoding.uavMask = uavMask;
|
|
encoding.textData = HWSHADER_Get(cShader, common.hShaderMemHandle);
|
|
encoding.textSize = cShader->common.codeLenInByte;
|
|
instructionCnt_ = encoding.textSize / sizeof(uint32_t);
|
|
encoding.scratchRegisterCount = cShader->common.scratchSize;
|
|
encoding.UAVReturnBufferTotalSize = 0;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize)
|
|
{
|
|
// Copy the shader_isa into a buffer
|
|
hwMetaData_ = new char[shaderSize];
|
|
if (hwMetaData_ == NULL) {
|
|
return false;
|
|
}
|
|
memcpy(hwMetaData_, shader, shaderSize);
|
|
|
|
SC_SI_HWSHADER_CS* siMetaData = reinterpret_cast<SC_SI_HWSHADER_CS*>(hwMetaData_);
|
|
|
|
// Code to patch the pointers in the shader object.
|
|
// Must be preferably done in the compiler library
|
|
size_t offset = siMetaData->common.uSizeInBytes;
|
|
if (siMetaData->common.u32PvtDataSizeInBytes > 0) {
|
|
siMetaData->common.pPvtData =
|
|
reinterpret_cast<SC_BYTE *>(
|
|
reinterpret_cast<char *>(siMetaData) + offset);
|
|
offset += siMetaData->common.u32PvtDataSizeInBytes;
|
|
}
|
|
if (siMetaData->common.codeLenInByte > 0) {
|
|
siMetaData->common.hShaderMemHandle =
|
|
reinterpret_cast<char *>(siMetaData) + offset;
|
|
offset += siMetaData->common.codeLenInByte;
|
|
}
|
|
|
|
char* headerBaseAddress =
|
|
reinterpret_cast<char*>(siMetaData->common.hShaderMemHandle);
|
|
hsa_ext_code_descriptor_t* hcd =
|
|
reinterpret_cast<hsa_ext_code_descriptor_t*>(headerBaseAddress);
|
|
amd_kernel_code_t* akc = reinterpret_cast<amd_kernel_code_t*>(
|
|
headerBaseAddress + hcd->code.handle);
|
|
|
|
address codeStartAddress = reinterpret_cast<address>(akc);
|
|
address codeEndAddress = reinterpret_cast<address>(hcd) + siMetaData->common.codeLenInByte;
|
|
codeSize_ = codeEndAddress - codeStartAddress;
|
|
code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize));
|
|
|
|
// force to use remote memory for HW DEBUG
|
|
Resource::MemoryType resMemType = (!dev().settings().enableHwDebug_) ? Resource::Local : Resource::RemoteUSWC;
|
|
|
|
// Initialize kernel ISA code
|
|
if ((code_ != NULL) && code_->create(resMemType)) {
|
|
address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
|
|
// Copy only amd_kernel_code_t
|
|
memcpy(cpuCodePtr, codeStartAddress, codeSize_);
|
|
code_->unmap(NULL);
|
|
}
|
|
else {
|
|
LogError("Failed to allocate ISA code!");
|
|
return false;
|
|
}
|
|
cpuAqlCode_ = akc;
|
|
|
|
assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
|
|
"Scratch must be DWORD aligned");
|
|
workGroupInfo_.scratchRegs_ =
|
|
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
|
|
workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
|
|
workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
|
|
workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
|
|
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
|
|
workGroupInfo_.localMemSize_ =
|
|
workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size;
|
|
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
|
|
workGroupInfo_.usedStackSize_ = 0;
|
|
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
|
|
workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
|
|
|
|
return true;
|
|
}
|
|
} // namespace gpu
|