8cc3f47661
ECR #333753 - ORCA RT/Compiler Lib: HSA Code Object/RT independent loader introducing/integration into OpenCL. Changes by Evgeniy Mankov. Purpose: Use the same Finalizer & loader for both HSA & ORCA RT. AMDIL path is not affected. Changes: 1. The whole BRIG is finalized now instead of per kernel finalization (both in gpuprogram & hsail_be). 2. HSALoader is changed in order to work with CodeObject and new HSA Loader's API - Context. Now it is in ORCA's gpuprogram instead of Compiler Lib. 3. brig_loader.cpp is removed from compiler lib, as well as __aclHSALoader function exports from the whole stack. 4. BIF .text section now contains the whole finalized HSA CodeObject instead of separate symbols for finalized kernels. 5. ORCA RT now works directly with amd_kernel_code_t and doesn't need any SC metadata anymore. 6. aoc2 is supplemented with fake offline loader correspondingly. 7. amdocl/complib make system changes. 8. test_driver.pl update. ToDo: 1. Implement disassemble() & BuildLog() functions to support ISA dumping & SC error handling (Konstantin). 2. Global variables initialization by pragma reference (Konstantin). Test to verify: test_basic progvar_prog_scope_init. 3. Code Object without kernels support (Nikolay - ready). Test to verify: test_generic_address_space.exe library_function testing: windows smoke, pre check-in, ocl conformance 2.0, ocl SDK 2.9 Reviewers: Nikolay Haustov, German Andryeyev Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/amdocl.def.in#13 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#116 edit ... //depot/stg/opencl/drivers/opencl/compiler/legacy-lib/amdoclcl.def.in#2 edit ... //depot/stg/opencl/drivers/opencl/compiler/legacy-lib/amdoclcl.map.in#2 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/amdoclcl.def.in#12 edit ... 
//depot/stg/opencl/drivers/opencl/compiler/lib/amdoclcl.map.in#11 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/v0_8/if_acl.cpp#70 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/build/Makefile.gpu#32 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/hsail_be.cpp#44 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/build/Makefile.complib#85 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.cpp#9 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.h#18 edit ... //depot/stg/opencl/drivers/opencl/compiler/tools/aoc2/aoc2.cpp#70 edit ... //depot/stg/opencl/drivers/opencl/compiler/tools/aoc2/build/Makefile.aoc2#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#248 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudefs.hpp#121 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#288 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#112 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#194 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.hpp#59 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#33 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#368 edit ... //depot/stg/opencl/drivers/opencl/tests/hsa/bin/test_driver.pl#12 edit
190 wiersze
6.7 KiB
C++
190 wiersze
6.7 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpudefs.hpp"
|
|
#include "device/gpu/gpuprogram.hpp"
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include "acl.h"
|
|
#include "SCShadersSi.h"
|
|
#include "si_ci_vi_merged_offset.h"
|
|
#include "si_ci_vi_merged_registers.h"
|
|
#include <string>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
#include "amd_hsa_loader.hpp"
|
|
|
|
namespace gpu {
|
|
|
|
bool
|
|
NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding)
|
|
{
|
|
static const uint NumSiCsInfos = (70 + 5 + 1 + 32 + 6);
|
|
CALProgramInfoEntry* newInfos;
|
|
uint i = 0;
|
|
uint infoCount = NumSiCsInfos;
|
|
const SC_SI_HWSHADER_CS* cShader = reinterpret_cast<const SC_SI_HWSHADER_CS*>(shader);
|
|
newInfos = new CALProgramInfoEntry[infoCount];
|
|
encoding.progInfos = newInfos;
|
|
if (encoding.progInfos == 0) {
|
|
infoCount = 0;
|
|
return false;
|
|
}
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENT_COUNT;
|
|
newInfos[i].value = cShader->common.userElementCount;
|
|
i++;
|
|
for (unsigned int j = 0; j < cShader->common.userElementCount; j++) {
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD0 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].dataClass;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD1 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].apiSlot;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD2 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].startUserReg;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD3 + 4*j;
|
|
newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].userRegCount;
|
|
i++;
|
|
}
|
|
|
|
newInfos[i].address = AMU_ABI_SI_NUM_VGPRS;
|
|
newInfos[i].value = cShader->common.numVgprs;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_NUM_SGPRS;
|
|
newInfos[i].value = cShader->common.numSgprs;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_NUM_SGPRS_AVAIL;
|
|
newInfos[i].value = SI_sgprs_avail; //512;//options.NumSGPRsAvailable;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_NUM_VGPRS_AVAIL;
|
|
newInfos[i].value = SI_vgprs_avail;//options.NumVGPRsAvailable;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_SI_FLOAT_MODE;
|
|
newInfos[i].value = cShader->common.floatMode;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_SI_IEEE_MODE;
|
|
newInfos[i].value = cShader->common.bIeeeMode;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_SI_SCRATCH_SIZE;
|
|
newInfos[i].value = cShader->common.scratchSize;;
|
|
i++;
|
|
|
|
newInfos[i].address = mmCOMPUTE_PGM_RSRC2;
|
|
newInfos[i].value = cShader->computePgmRsrc2.u32All;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X;
|
|
newInfos[i].value = cShader->numThreadX;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y;
|
|
newInfos[i].value = cShader->numThreadY;
|
|
i++;
|
|
newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z;
|
|
newInfos[i].value = cShader->numThreadZ;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_ORDERED_APPEND_ENABLE;
|
|
newInfos[i].value = cShader->bOrderedAppendEnable;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_RAT_OP_IS_USED;
|
|
newInfos[i].value = cShader->common.uavResourceUsage[0];
|
|
i++;
|
|
|
|
for (unsigned int j = 0; j < ((SC_MAX_UAV + 31) / 32); j++) {
|
|
newInfos[i].address = AMU_ABI_UAV_RESOURCE_MASK_0 + j;
|
|
newInfos[i].value = cShader->common.uavResourceUsage[j];
|
|
i++;
|
|
}
|
|
|
|
newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD; // Setting the same as for scWrapR800Info
|
|
newInfos[i].value = 1;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE;
|
|
newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4; //options.WavefrontSize;
|
|
i++;
|
|
|
|
newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL;
|
|
newInfos[i].value = SI_ldssize_avail; //options.LDSSize;
|
|
i++;
|
|
|
|
COMPUTE_PGM_RSRC2 computePgmRsrc2;
|
|
computePgmRsrc2.u32All = cShader->computePgmRsrc2.u32All;
|
|
|
|
newInfos[i].address = AMU_ABI_LDS_SIZE_USED;
|
|
newInfos[i].value = 64 * 4 * computePgmRsrc2.bits.LDS_SIZE;
|
|
i++;
|
|
|
|
infoCount = i;
|
|
assert((i + 4 * (16 - cShader->common.userElementCount)) == NumSiCsInfos);
|
|
encoding.progInfosCount = infoCount;
|
|
|
|
CALUavMask uavMask;
|
|
memcpy(uavMask.mask, cShader->common.uavResourceUsage, sizeof(CALUavMask));
|
|
encoding.uavMask = uavMask;
|
|
encoding.textData = HWSHADER_Get(cShader, common.hShaderMemHandle);
|
|
encoding.textSize = cShader->common.codeLenInByte;
|
|
instructionCnt_ = encoding.textSize / sizeof(uint32_t);
|
|
encoding.scratchRegisterCount = cShader->common.scratchSize;
|
|
encoding.UAVReturnBufferTotalSize = 0;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
|
|
{
|
|
if (!sym) {
|
|
return false;
|
|
}
|
|
uint64_t akc_addr = 0;
|
|
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&akc_addr))) {
|
|
return false;
|
|
}
|
|
amd_kernel_code_t *akc = reinterpret_cast<amd_kernel_code_t*>(akc_addr);
|
|
cpuAqlCode_ = akc;
|
|
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast<void*>(&codeSize_))) {
|
|
return false;
|
|
}
|
|
size_t akc_align = 0;
|
|
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) {
|
|
return false;
|
|
}
|
|
code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align));
|
|
// Initialize kernel ISA code
|
|
if (code_ && code_->create(Resource::Shader)) {
|
|
address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
|
|
// Copy only amd_kernel_code_t
|
|
memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
|
|
code_->unmap(NULL);
|
|
}
|
|
else {
|
|
LogError("Failed to allocate ISA code!");
|
|
return false;
|
|
}
|
|
|
|
assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
|
|
"Scratch must be DWORD aligned");
|
|
workGroupInfo_.scratchRegs_ =
|
|
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
|
|
workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
|
|
workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
|
|
workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
|
|
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
|
|
workGroupInfo_.localMemSize_ =
|
|
workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size;
|
|
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
|
|
workGroupInfo_.usedStackSize_ = 0;
|
|
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
|
|
workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
|
|
|
|
return true;
|
|
}
|
|
} // namespace gpu
|