Implement accurate max block size in hipFuncGetAttributes() (#1676)

This PR ensures that the maxThreadsPerBlock returned by hipFuncGetAttributes is a multiple of the warp size and that the register usage of the maximum block does not exceed the number of available registers.

Fixes #1662
이 커밋은 다음에 포함됨:
jglaser
2020-03-18 01:50:06 -04:00
커밋한 사람 GitHub
부모 58058091ad
커밋 b5e683a35d
+42 -8
파일 보기
@@ -1247,7 +1247,7 @@ const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
}
hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_t& kd) {
hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) {
hipFuncAttributes r{};
hipDeviceProp_t prop{};
@@ -1257,23 +1257,57 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
if (kd._is_code_object_v3) {
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
// object format version?
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
} else {
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
r.numRegs = kd._header->workitem_vgpr_count;
r.binaryVersion =
kd._header->amd_machine_version_major * 10 +
kd._header->amd_machine_version_minor;
}
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
size_t usedVGPRS = 0;
size_t usedSGPRS = 0;
size_t usedLDS = 0;
getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS);
r.numRegs = usedVGPRS;
size_t wavefrontSize = prop.warpSize;
size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
const size_t numSIMD = 4;
const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD;
size_t maxWaves = 0;
for (int i = 0; i < maxWavefrontsPerBlock; i++) {
size_t wavefronts = i + 1;
if (usedVGPRS > 0) {
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD);
size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
// Calculate blocks occupancy per CU based on VGPR usage
if (vgprs_alu_occupancy < wavefronts)
break;
}
if (usedSGPRS > 0) {
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
if (sgprs_alu_occupancy < wavefronts)
break;
}
maxWaves = wavefronts;
}
r.maxThreadsPerBlock = maxWaves * wavefrontSize;
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;