Added support of hipOccupancyMaxActiveBlocksPerMultiprocessor & hipOc… (#1240)
* Added support for the hipOccupancyMaxActiveBlocksPerMultiprocessor & hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags APIs
* Take SGPR usage into account when determining the max active blocks in hipOccupancyMaxActiveBlocksPerMultiprocessor()
Este commit está contenido en:
+98
-25
@@ -879,6 +879,30 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
// Reads the register and LDS requirements of kernel `f` from its header and
// stores them through the three output pointers.
//
//   f          kernel to inspect (its `_name` and `_header` members are read)
//   usedVGPRS  out: VGPR count, rounded up to a multiple of 4
//   usedSGPRS  out: SGPR count, rounded up to a multiple of 16
//   usedLDS    out: group-segment (LDS) size taken verbatim from the header
//
// None of the pointers is checked for null.
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
    // A ".kd" marker in the kernel name selects the code object v3 layout.
    if (f->_name.find(".kd") == std::string::npos) {
        // Legacy layout: the header stores plain register counts.
        const auto kernelCode = f->_header;

        // VGPRs granularity is 4: round the raw count up to the next multiple.
        *usedVGPRS = ((kernelCode->workitem_vgpr_count + 3) >> 2) << 2;

        // Add the 2 VCC registers, then round up to the SGPR granularity of 16.
        const size_t sgprsWithVcc = kernelCode->wavefront_sgpr_count + 2;
        *usedSGPRS = ((sgprsWithVcc + 15) >> 4) << 4;

        *usedLDS = kernelCode->workgroup_group_segment_byte_size;
        return;
    }

    // Code object v3: register usage is encoded as granulated counts inside
    // COMPUTE_PGM_RSRC1.
    const auto descriptor = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
    const auto rsrc1 = descriptor->compute_pgm_rsrc1;

    // GRANULATED_WAVEFRONT_VGPR_COUNT occupies bits 0:5.
    // For gfx6-gfx9 it holds max(0, ceil(vgprs_used / 4) - 1).
    const auto vgprGranules = rsrc1 & 0x3F;
    *usedVGPRS = (vgprGranules + 1) << 2;

    // GRANULATED_WAVEFRONT_SGPR_COUNT occupies bits 6:9.
    // For gfx9+ it holds 2 * max(0, ceil(sgprs_used / 16) - 1), so the field
    // is halved before being turned back into a register count.
    const auto sgprField = (rsrc1 & 0x3C0) >> 6;
    *usedSGPRS = ((sgprField >> 1) + 1) << 4;

    *usedLDS = descriptor->group_segment_fixed_size;
}
|
||||
|
||||
hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
|
||||
hipFunction_t f, size_t dynSharedMemPerBlk,
|
||||
uint32_t blockSizeLimit)
|
||||
@@ -886,10 +910,8 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* bloc
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
hipError_t ret = hipSuccess;
|
||||
|
||||
if (ctx == nullptr) {
|
||||
ret = hipErrorInvalidDevice;
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
@@ -900,26 +922,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* bloc
|
||||
size_t usedVGPRS = 0;
|
||||
size_t usedSGPRS = 0;
|
||||
size_t usedLDS = 0;
|
||||
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
|
||||
if (is_code_object_v3) {
|
||||
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
|
||||
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
|
||||
usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
|
||||
usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
|
||||
usedLDS = header->group_segment_fixed_size;
|
||||
}
|
||||
else {
|
||||
const auto header = f->_header;
|
||||
// VGPRs granularity is 4
|
||||
usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
|
||||
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
|
||||
usedSGPRS = header->wavefront_sgpr_count + 2;
|
||||
usedSGPRS = ((usedSGPRS + 15) >> 4) << 4;
|
||||
usedLDS = header->workgroup_group_segment_byte_size;
|
||||
}
|
||||
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
|
||||
|
||||
// try different workgroup sizes to find the maximum potential occupancy
|
||||
// based on the usage of VGPRs and LDS
|
||||
@@ -1009,10 +1012,9 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* bloc
|
||||
*blockSize = maxWavefronts * wavefrontSize;
|
||||
*gridSize = min((maxThreadsCnt + *blockSize - 1) / *blockSize, prop.multiProcessorCount);
|
||||
|
||||
return ret;
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
|
||||
hipFunction_t f, size_t dynSharedMemPerBlk,
|
||||
uint32_t blockSizeLimit)
|
||||
@@ -1022,3 +1024,74 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
|
||||
return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(
|
||||
gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
|
||||
}
|
||||
|
||||
// Computes how many blocks of `blockSize` threads of kernel `f` can be
// resident on one compute unit (CU) at the same time, limited by VGPR usage,
// SGPR usage and LDS usage.
//
//   numBlocks           out: resulting number of blocks per CU
//   f                   kernel whose resource usage is inspected
//   blockSize           threads per block; must be non-zero
//   dynSharedMemPerBlk  dynamic LDS bytes per block, added to the kernel's
//                       static LDS usage
//
// Returns hipErrorInvalidDevice when there is no default context or the
// device reports a zero wavefront size, hipErrorInvalidValue when `numBlocks`
// or `f` is null or `blockSize` is zero, and hipSuccess otherwise.
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
    uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
    if (ctx == nullptr) {
        return hipErrorInvalidDevice;
    }

    // Reject arguments that would be dereferenced or used as a divisor below.
    if (numBlocks == nullptr || f == nullptr || blockSize == 0) {
        return hipErrorInvalidValue;
    }

    hipDeviceProp_t prop{};
    ihipGetDeviceProperties(&prop, ctx->getDevice()->_deviceId);

    // Fall back to 64K registers when the device does not report a value.
    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

    // `prop` starts zero-initialised; a zero wavefront size means the query
    // did not fill it in, and it would be used as a divisor below.
    const size_t wavefrontSize = prop.warpSize;
    if (wavefrontSize == 0) {
        return hipErrorInvalidDevice;
    }

    size_t usedVGPRS = 0;
    size_t usedSGPRS = 0;
    size_t usedLDS = 0;
    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);

    // Due to SPI and private memory limitations, the max number of wavefronts
    // per CU is 32.
    const size_t maxWavefrontsPerCU =
        std::min<size_t>(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);

    const size_t simdPerCU = 4;
    const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;

    // Wavefronts needed by one block (rounded up); non-zero because
    // blockSize is non-zero.
    const size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;

    // Waves per SIMD allowed by VGPR usage. A kernel that uses no VGPRs is
    // not limited by them.
    const size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
    const size_t vgprWavesPerSimd =
        (usedVGPRS == 0) ? maxWavesPerSimd
                         : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);

    // Waves per SIMD allowed by SGPR usage. A kernel that uses no SGPRs is
    // not limited by them.
    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
    const size_t sgprWavesPerSimd =
        (usedSGPRS == 0) ? maxWavesPerSimd
                         : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS);

    // Blocks per CU based on the tighter of the two register limits.
    size_t blocksPerCU = simdPerCU * std::min(vgprWavesPerSimd, sgprWavesPerSimd) / numWavefronts;

    const size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
    if (total_used_lds != 0) {
        // LDS occupancy per CU: lds_per_cu / (static_lds + dynamic_lds).
        // Compared in size_t so a large quotient is not truncated first.
        const size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
        blocksPerCU = std::min(blocksPerCU, lds_occupancy);
    }

    *numBlocks = static_cast<uint32_t>(blocksPerCU);
    return hipSuccess;
}
|
||||
|
||||
// Public API wrapper: runs HIP_INIT_API for this entry point, delegates the
// occupancy computation to ihipOccupancyMaxActiveBlocksPerMultiprocessor and
// passes the resulting status through ihipLogStatus before returning it.
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
    uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);

    const hipError_t occupancyStatus =
        ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
    return ihipLogStatus(occupancyStatus);
}
|
||||
|
||||
// Public API wrapper taking an extra `flags` argument. `flags` is handed to
// HIP_INIT_API only; it is not forwarded to the occupancy computation, so the
// result is the one ihipOccupancyMaxActiveBlocksPerMultiprocessor produces
// for the remaining arguments. The status is passed through ihipLogStatus
// before being returned.
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
    uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
    unsigned int flags)
{
    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);

    const hipError_t occupancyStatus =
        ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
    return ihipLogStatus(occupancyStatus);
}
|
||||
|
||||
Referencia en una nueva incidencia
Block a user