// // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. // #include "device/gpu/gpudefs.hpp" #include "device/gpu/gpuprogram.hpp" #include "device/gpu/gpukernel.hpp" #include "acl.h" #include "SCShadersSi.h" #include "si_ci_vi_merged_offset.h" #include "si_ci_vi_merged_registers.h" #include #include #include #include #include namespace gpu { bool NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding) { static const uint NumSiCsInfos = (70 + 5 + 1 + 32 + 6); CALProgramInfoEntry* newInfos; uint i = 0; uint infoCount = NumSiCsInfos; const SC_SI_HWSHADER_CS* cShader = reinterpret_cast(shader); newInfos = new CALProgramInfoEntry[infoCount]; encoding.progInfos = newInfos; if (encoding.progInfos == 0) { infoCount = 0; return false; } newInfos[i].address = AMU_ABI_USER_ELEMENT_COUNT; newInfos[i].value = cShader->common.userElementCount; i++; for (unsigned int j = 0; j < cShader->common.userElementCount; j++) { newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD0 + 4*j; newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].dataClass; i++; newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD1 + 4*j; newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].apiSlot; i++; newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD2 + 4*j; newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].startUserReg; i++; newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD3 + 4*j; newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].userRegCount; i++; } newInfos[i].address = AMU_ABI_SI_NUM_VGPRS; newInfos[i].value = cShader->common.numVgprs; i++; newInfos[i].address = AMU_ABI_SI_NUM_SGPRS; newInfos[i].value = cShader->common.numSgprs; i++; newInfos[i].address = AMU_ABI_SI_NUM_SGPRS_AVAIL; newInfos[i].value = SI_sgprs_avail; //512;//options.NumSGPRsAvailable; i++; newInfos[i].address = AMU_ABI_SI_NUM_VGPRS_AVAIL; newInfos[i].value = SI_vgprs_avail;//options.NumVGPRsAvailable; i++; newInfos[i].address = AMU_ABI_SI_FLOAT_MODE; newInfos[i].value = cShader->common.floatMode; i++; newInfos[i].address = AMU_ABI_SI_IEEE_MODE; newInfos[i].value = cShader->common.bIeeeMode; i++; newInfos[i].address = AMU_ABI_SI_SCRATCH_SIZE; newInfos[i].value = cShader->common.scratchSize;; i++; newInfos[i].address = mmCOMPUTE_PGM_RSRC2; newInfos[i].value = cShader->computePgmRsrc2.u32All; i++; newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X; newInfos[i].value = cShader->numThreadX; i++; newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y; newInfos[i].value = cShader->numThreadY; i++; newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z; newInfos[i].value = cShader->numThreadZ; i++; newInfos[i].address = AMU_ABI_ORDERED_APPEND_ENABLE; newInfos[i].value = cShader->bOrderedAppendEnable; i++; newInfos[i].address = AMU_ABI_RAT_OP_IS_USED; newInfos[i].value = cShader->common.uavResourceUsage[0]; i++; for (unsigned int j = 0; j < ((SC_MAX_UAV + 31) / 32); j++) { newInfos[i].address = AMU_ABI_UAV_RESOURCE_MASK_0 + j; newInfos[i].value = cShader->common.uavResourceUsage[j]; i++; } newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD; // Setting the same as for scWrapR800Info newInfos[i].value = 1; i++; newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE; newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4; //options.WavefrontSize; i++; newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL; newInfos[i].value = SI_ldssize_avail; //options.LDSSize; i++; COMPUTE_PGM_RSRC2 computePgmRsrc2; computePgmRsrc2.u32All = cShader->computePgmRsrc2.u32All; newInfos[i].address = AMU_ABI_LDS_SIZE_USED; newInfos[i].value = 64 * 4 * computePgmRsrc2.bits.LDS_SIZE; i++; infoCount = i; assert((i + 4 * (16 - cShader->common.userElementCount)) == NumSiCsInfos); encoding.progInfosCount = infoCount; CALUavMask uavMask; memcpy(uavMask.mask, cShader->common.uavResourceUsage, sizeof(CALUavMask)); encoding.uavMask = uavMask; encoding.textData = HWSHADER_Get(cShader, common.hShaderMemHandle); encoding.textSize = cShader->common.codeLenInByte; instructionCnt_ = encoding.textSize / sizeof(uint32_t); encoding.scratchRegisterCount = cShader->common.scratchSize; encoding.UAVReturnBufferTotalSize = 0; return true; } bool HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize) { // Copy the shader_isa into a buffer hwMetaData_ = new char[shaderSize]; if (hwMetaData_ == NULL) { return false; } memcpy(hwMetaData_, shader, shaderSize); SC_SI_HWSHADER_CS* siMetaData = reinterpret_cast(hwMetaData_); // Code to patch the pointers in the shader object. // Must be preferably done in the compiler library size_t offset = siMetaData->common.uSizeInBytes; if (siMetaData->common.u32PvtDataSizeInBytes > 0) { siMetaData->common.pPvtData = reinterpret_cast( reinterpret_cast(siMetaData) + offset); offset += siMetaData->common.u32PvtDataSizeInBytes; } if (siMetaData->common.codeLenInByte > 0) { siMetaData->common.hShaderMemHandle = reinterpret_cast(siMetaData) + offset; offset += siMetaData->common.codeLenInByte; } char* headerBaseAddress = reinterpret_cast(siMetaData->common.hShaderMemHandle); hsa_ext_code_descriptor_t* hcd = reinterpret_cast(headerBaseAddress); amd_kernel_code_t* akc = reinterpret_cast( headerBaseAddress + hcd->code.handle); address codeStartAddress = reinterpret_cast
(akc); address codeEndAddress = reinterpret_cast
(hcd) + siMetaData->common.codeLenInByte; codeSize_ = codeEndAddress - codeStartAddress; code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize)); // force to use remote memory for HW DEBUG Resource::MemoryType resMemType = (!dev().settings().enableHwDebug_) ? Resource::Local : Resource::RemoteUSWC; // Initialize kernel ISA code if ((code_ != NULL) && code_->create(resMemType)) { address cpuCodePtr = static_cast
(code_->map(NULL, Resource::WriteOnly)); // Copy only amd_kernel_code_t memcpy(cpuCodePtr, codeStartAddress, codeSize_); code_->unmap(NULL); } else { LogError("Failed to allocate ISA code!"); return false; } cpuAqlCode_ = akc; assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned"); workGroupInfo_.scratchRegs_ = amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable(); workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable(); workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize; workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size; workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; workGroupInfo_.usedStackSize_ = 0; workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize; return true; } } // namespace gpu