rocclr/runtime/device/gpu/gpuscsi.cpp

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//

#include "device/gpu/gpudefs.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpukernel.hpp"
#include "acl.h"
#include "SCShadersSi.h"
#include "si_ci_vi_merged_offset.h"
#include "si_ci_vi_merged_registers.h"
#include <string>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
#include <new>
#include "amd_hsa_loader.hpp"

namespace gpu {

bool NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding) {
  static const uint NumSiCsInfos = (70 + 5 + 1 + 32 + 6);
  CALProgramInfoEntry* newInfos;
  uint i = 0;
  uint infoCount = NumSiCsInfos;
  const SC_SI_HWSHADER_CS* cShader = reinterpret_cast<const SC_SI_HWSHADER_CS*>(shader);
  newInfos = new CALProgramInfoEntry[infoCount];
  encoding.progInfos = newInfos;
  if (encoding.progInfos == 0) {
    infoCount = 0;
    return false;
  }
  newInfos[i].address = AMU_ABI_USER_ELEMENT_COUNT;
  newInfos[i].value = cShader->common.userElementCount;
  i++;
  for (unsigned int j = 0; j < cShader->common.userElementCount; j++) {
    newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD0 + 4 * j;
    newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].dataClass;
    i++;
    newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD1 + 4 * j;
    newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].apiSlot;
    i++;
    newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD2 + 4 * j;
    newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].startUserReg;
    i++;
    newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD3 + 4 * j;
    newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].userRegCount;
    i++;
  }

  newInfos[i].address = AMU_ABI_SI_NUM_VGPRS;
  newInfos[i].value = cShader->common.numVgprs;
  i++;
  newInfos[i].address = AMU_ABI_SI_NUM_SGPRS;
  newInfos[i].value = cShader->common.numSgprs;
  i++;
  newInfos[i].address = AMU_ABI_SI_NUM_SGPRS_AVAIL;
  newInfos[i].value = SI_sgprs_avail;  // 512;//options.NumSGPRsAvailable;
  i++;
  newInfos[i].address = AMU_ABI_SI_NUM_VGPRS_AVAIL;
  newInfos[i].value = SI_vgprs_avail;  // options.NumVGPRsAvailable;
  i++;

  newInfos[i].address = AMU_ABI_SI_FLOAT_MODE;
  newInfos[i].value = cShader->common.floatMode;
  i++;
  newInfos[i].address = AMU_ABI_SI_IEEE_MODE;
  newInfos[i].value = cShader->common.bIeeeMode;
  i++;

  newInfos[i].address = AMU_ABI_SI_SCRATCH_SIZE;
  newInfos[i].value = cShader->common.scratchSize;
  ;
  i++;

  newInfos[i].address = mmCOMPUTE_PGM_RSRC2;
  newInfos[i].value = cShader->computePgmRsrc2.u32All;
  i++;

  newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X;
  newInfos[i].value = cShader->numThreadX;
  i++;
  newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y;
  newInfos[i].value = cShader->numThreadY;
  i++;
  newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z;
  newInfos[i].value = cShader->numThreadZ;
  i++;

  newInfos[i].address = AMU_ABI_ORDERED_APPEND_ENABLE;
  newInfos[i].value = cShader->bOrderedAppendEnable;
  i++;

  newInfos[i].address = AMU_ABI_RAT_OP_IS_USED;
  newInfos[i].value = cShader->common.uavResourceUsage[0];
  i++;

  for (unsigned int j = 0; j < ((SC_MAX_UAV + 31) / 32); j++) {
    newInfos[i].address = AMU_ABI_UAV_RESOURCE_MASK_0 + j;
    newInfos[i].value = cShader->common.uavResourceUsage[j];
    i++;
  }

  newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD;  // Setting the same as for scWrapR800Info
  newInfos[i].value = 1;
  i++;

  newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE;
  newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4;  // options.WavefrontSize;
  i++;

  newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL;
  newInfos[i].value = SI_ldssize_avail;  // options.LDSSize;
  i++;

  COMPUTE_PGM_RSRC2 computePgmRsrc2;
  computePgmRsrc2.u32All = cShader->computePgmRsrc2.u32All;

  newInfos[i].address = AMU_ABI_LDS_SIZE_USED;
  newInfos[i].value = 64 * 4 * computePgmRsrc2.bits.LDS_SIZE;
  i++;

  infoCount = i;
  assert((i + 4 * (16 - cShader->common.userElementCount)) == NumSiCsInfos);
  encoding.progInfosCount = infoCount;

  encoding.textData = HWSHADER_Get(cShader, common.hShaderMemHandle);
  encoding.textSize = cShader->common.codeLenInByte;
  instructionCnt_ = encoding.textSize / sizeof(uint32_t);
  encoding.scratchRegisterCount = cShader->common.scratchSize;
  encoding.UAVReturnBufferTotalSize = 0;

  return true;
}

// Reads the kernel object described by a loader symbol and initializes this
// kernel's code allocation and work-group information from it.
//
// sym - loader symbol of the kernel; a null symbol is rejected.
//
// Returns false when the symbol is null, when any of the queried symbol
// attributes (kernel object address, size, alignment) is unavailable, when the
// reported kernel object address is null, or when the device allocation for
// the code cannot be created or mapped. Returns true otherwise.
//
// Side effects: sets cpuAqlCode_, codeSize_, code_ (real programs only) and
// the fields of workGroupInfo_.
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
  if (!sym) {
    return false;
  }

  // Host address of the kernel object, as reported by the loader.
  uint64_t akc_addr = 0;
  if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&akc_addr))) {
    return false;
  }
  amd_kernel_code_t* akc = reinterpret_cast<amd_kernel_code_t*>(akc_addr);
  // The header is dereferenced below, so a null kernel object cannot be used.
  if (akc == NULL) {
    return false;
  }
  cpuAqlCode_ = akc;

  if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
                    reinterpret_cast<void*>(&codeSize_))) {
    return false;
  }
  size_t akc_align = 0;
  if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
                    reinterpret_cast<void*>(&akc_align))) {
    return false;
  }

  // Allocate HW resources for the real program only
  if (!prog().isNull()) {
    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align));
    // Initialize kernel ISA code
    if (code_ && code_->create(Resource::Shader)) {
      address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
      // Never copy through a null mapping.
      if (cpuCodePtr == NULL) {
        LogError("Failed to map ISA code!");
        return false;
      }
      // Copy codeSize_ bytes of the kernel object into the allocation.
      memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
      code_->unmap(NULL);
    } else {
      LogError("Failed to allocate ISA code!");
      return false;
    }
  }

  // Resource usage taken from the kernel code header.
  assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned");
  // Private segment size rounded up to 16 bytes, expressed in uint-sized units.
  workGroupInfo_.scratchRegs_ =
      amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
  workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
  workGroupInfo_.availableLDSSize_ = dev().info().localMemSize_;
  workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
      akc->workgroup_group_segment_byte_size;
  workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
  workGroupInfo_.usedStackSize_ = 0;
  workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;

  if (!prog().isNull()) {
    // Real program: query the limits from the device.
    workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
    workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
    workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
    // NOTE(review): wavefrontPerSIMD_ receives the wavefront size here, and 64
    // in the branch below - confirm this is intended.
    workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
  } else {
    // Null program: fixed values are used instead of device queries.
    workGroupInfo_.availableSGPRs_ = 104;
    workGroupInfo_.availableVGPRs_ = 256;
    workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
  }
  return true;
}
}  // namespace gpu
initial commit 2014-07-04 16:17:05 -04:00			`//`
			`// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.`
			`//`

			`#include "device/gpu/gpudefs.hpp"`
			`#include "device/gpu/gpuprogram.hpp"`
			`#include "device/gpu/gpukernel.hpp"`
			`#include "acl.h"`
			`#include "SCShadersSi.h"`
P4 to Git Change 1084700 by ericz@fl_ericz3 on 2014/10/06 18:17:12 2014-10-06 18:33:20 -04:00			`#include "si_ci_vi_merged_offset.h"`
P4 to Git Change 1097741 by efinger@efinger_BDCW7-EFINGER on 2014/11/17 15:47:10 2014-11-17 15:56:59 -05:00			`#include "si_ci_vi_merged_registers.h"`
initial commit 2014-07-04 16:17:05 -04:00			`#include <string>`
			`#include <fstream>`
			`#include <sstream>`
			`#include <iostream>`
			`#include <ctime>`
P4 to Git Change 1170297 by nhaustov@nhaustov_hsa on 2015/07/14 05:36:10 2015-07-14 17:08:54 -04:00			`#include "amd_hsa_loader.hpp"`
initial commit 2014-07-04 16:17:05 -04:00
			`namespace gpu {`

P4 to Git Change 1398097 by lmoriche@lmoriche_opencl_dev2 on 2017/04/13 13:01:56 2017-04-13 13:56:38 -04:00			`bool NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding) {`
			`static const uint NumSiCsInfos = (70 + 5 + 1 + 32 + 6);`
			`CALProgramInfoEntry* newInfos;`
			`uint i = 0;`
			`uint infoCount = NumSiCsInfos;`
			`const SC_SI_HWSHADER_CS* cShader = reinterpret_cast<const SC_SI_HWSHADER_CS*>(shader);`
			`newInfos = new CALProgramInfoEntry[infoCount];`
			`encoding.progInfos = newInfos;`
			`if (encoding.progInfos == 0) {`
			`infoCount = 0;`
			`return false;`
			`}`
			`newInfos[i].address = AMU_ABI_USER_ELEMENT_COUNT;`
			`newInfos[i].value = cShader->common.userElementCount;`
			`i++;`
			`for (unsigned int j = 0; j < cShader->common.userElementCount; j++) {`
			`newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD0 + 4 * j;`
			`newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].dataClass;`
			`i++;`
			`newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD1 + 4 * j;`
			`newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].apiSlot;`
			`i++;`
			`newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD2 + 4 * j;`
			`newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].startUserReg;`
			`i++;`
			`newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD3 + 4 * j;`
			`newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].userRegCount;`
			`i++;`
			`}`

			`newInfos[i].address = AMU_ABI_SI_NUM_VGPRS;`
			`newInfos[i].value = cShader->common.numVgprs;`
			`i++;`
			`newInfos[i].address = AMU_ABI_SI_NUM_SGPRS;`
			`newInfos[i].value = cShader->common.numSgprs;`
			`i++;`
			`newInfos[i].address = AMU_ABI_SI_NUM_SGPRS_AVAIL;`
			`newInfos[i].value = SI_sgprs_avail; // 512;//options.NumSGPRsAvailable;`
			`i++;`
			`newInfos[i].address = AMU_ABI_SI_NUM_VGPRS_AVAIL;`
			`newInfos[i].value = SI_vgprs_avail; // options.NumVGPRsAvailable;`
			`i++;`

			`newInfos[i].address = AMU_ABI_SI_FLOAT_MODE;`
			`newInfos[i].value = cShader->common.floatMode;`
			`i++;`
			`newInfos[i].address = AMU_ABI_SI_IEEE_MODE;`
			`newInfos[i].value = cShader->common.bIeeeMode;`
			`i++;`

			`newInfos[i].address = AMU_ABI_SI_SCRATCH_SIZE;`
			`newInfos[i].value = cShader->common.scratchSize;`
			`;`
			`i++;`

			`newInfos[i].address = mmCOMPUTE_PGM_RSRC2;`
			`newInfos[i].value = cShader->computePgmRsrc2.u32All;`
			`i++;`

			`newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X;`
			`newInfos[i].value = cShader->numThreadX;`
			`i++;`
			`newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y;`
			`newInfos[i].value = cShader->numThreadY;`
			`i++;`
			`newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z;`
			`newInfos[i].value = cShader->numThreadZ;`
			`i++;`

			`newInfos[i].address = AMU_ABI_ORDERED_APPEND_ENABLE;`
			`newInfos[i].value = cShader->bOrderedAppendEnable;`
			`i++;`

			`newInfos[i].address = AMU_ABI_RAT_OP_IS_USED;`
			`newInfos[i].value = cShader->common.uavResourceUsage[0];`
			`i++;`

			`for (unsigned int j = 0; j < ((SC_MAX_UAV + 31) / 32); j++) {`
			`newInfos[i].address = AMU_ABI_UAV_RESOURCE_MASK_0 + j;`
			`newInfos[i].value = cShader->common.uavResourceUsage[j];`
			`i++;`
			`}`

			`newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD; // Setting the same as for scWrapR800Info`
			`newInfos[i].value = 1;`
			`i++;`

			`newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE;`
			`newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4; // options.WavefrontSize;`
			`i++;`

			`newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL;`
			`newInfos[i].value = SI_ldssize_avail; // options.LDSSize;`
			`i++;`

			`COMPUTE_PGM_RSRC2 computePgmRsrc2;`
			`computePgmRsrc2.u32All = cShader->computePgmRsrc2.u32All;`

			`newInfos[i].address = AMU_ABI_LDS_SIZE_USED;`
			`newInfos[i].value = 64 * 4 * computePgmRsrc2.bits.LDS_SIZE;`
			`i++;`

			`infoCount = i;`
			`assert((i + 4 * (16 - cShader->common.userElementCount)) == NumSiCsInfos);`
			`encoding.progInfosCount = infoCount;`

			`encoding.textData = HWSHADER_Get(cShader, common.hShaderMemHandle);`
			`encoding.textSize = cShader->common.codeLenInByte;`
			`instructionCnt_ = encoding.textSize / sizeof(uint32_t);`
			`encoding.scratchRegisterCount = cShader->common.scratchSize;`
			`encoding.UAVReturnBufferTotalSize = 0;`

			`return true;`
initial commit 2014-07-04 16:17:05 -04:00			`}`

P4 to Git Change 1398097 by lmoriche@lmoriche_opencl_dev2 on 2017/04/13 13:01:56 2017-04-13 13:56:38 -04:00			`bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {`
			`if (!sym) {`
			`return false;`
			`}`
			`uint64_t akc_addr = 0;`
			`if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&akc_addr))) {`
			`return false;`
			`}`
			`amd_kernel_code_t* akc = reinterpret_cast<amd_kernel_code_t*>(akc_addr);`
			`cpuAqlCode_ = akc;`
			`if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,`
			`reinterpret_cast<void*>(&codeSize_))) {`
			`return false;`
			`}`
			`size_t akc_align = 0;`
			`if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,`
			`reinterpret_cast<void*>(&akc_align))) {`
			`return false;`
			`}`

			`// Allocate HW resources for the real program only`
			`if (!prog().isNull()) {`
			`code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align));`
			`// Initialize kernel ISA code`
			`if (code_ && code_->create(Resource::Shader)) {`
			`address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));`
			`// Copy only amd_kernel_code_t`
			`memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);`
			`code_->unmap(NULL);`
			`} else {`
			`LogError("Failed to allocate ISA code!");`
			`return false;`
P4 to Git Change 1250949 by gandryey@gera-w8 on 2016/03/24 12:06:49 2016-03-24 12:15:44 -04:00			`}`
P4 to Git Change 1398097 by lmoriche@lmoriche_opencl_dev2 on 2017/04/13 13:01:56 2017-04-13 13:56:38 -04:00			`}`

			`assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned");`
			`workGroupInfo_.scratchRegs_ =`
			`amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);`
			`workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;`
			`workGroupInfo_.availableLDSSize_ = dev().info().localMemSize_;`
			`workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =`
			`akc->workgroup_group_segment_byte_size;`
			`workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;`
			`workGroupInfo_.usedStackSize_ = 0;`
			`workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;`

			`if (!prog().isNull()) {`
			`workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();`
			`workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();`
			`workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;`
			`workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;`
			`} else {`
			`workGroupInfo_.availableSGPRs_ = 104;`
			`workGroupInfo_.availableVGPRs_ = 256;`
			`workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;`
			`}`
			`return true;`
initial commit 2014-07-04 16:17:05 -04:00			`}`
P4 to Git Change 1398097 by lmoriche@lmoriche_opencl_dev2 on 2017/04/13 13:01:56 2017-04-13 13:56:38 -04:00			`} // namespace gpu`