Implement accurate max block size in hipFuncGetAttributes() (#1676)
This PR ensures that the maxThreadsPerBlock value returned by hipFuncGetAttributes() is a multiple of the warp size, and that the register usage of a block of that maximum size does not exceed the number of available registers. Fixes #1662
이 커밋은 다음에 포함됨:
+42
-8
@@ -1247,7 +1247,7 @@ const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
|
||||
return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
|
||||
}
|
||||
|
||||
hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_t& kd) {
|
||||
hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) {
|
||||
hipFuncAttributes r{};
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
@@ -1257,23 +1257,57 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
if (kd._is_code_object_v3) {
|
||||
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
|
||||
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
|
||||
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
|
||||
// object format version?
|
||||
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
|
||||
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
|
||||
} else {
|
||||
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
|
||||
r.numRegs = kd._header->workitem_vgpr_count;
|
||||
r.binaryVersion =
|
||||
kd._header->amd_machine_version_major * 10 +
|
||||
kd._header->amd_machine_version_minor;
|
||||
}
|
||||
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
|
||||
r.maxThreadsPerBlock = r.numRegs ?
|
||||
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
|
||||
prop.maxThreadsPerBlock;
|
||||
|
||||
size_t usedVGPRS = 0;
|
||||
size_t usedSGPRS = 0;
|
||||
size_t usedLDS = 0;
|
||||
getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS);
|
||||
|
||||
r.numRegs = usedVGPRS;
|
||||
|
||||
size_t wavefrontSize = prop.warpSize;
|
||||
size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;
|
||||
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
|
||||
const size_t numSIMD = 4;
|
||||
const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD;
|
||||
size_t maxWaves = 0;
|
||||
for (int i = 0; i < maxWavefrontsPerBlock; i++) {
|
||||
size_t wavefronts = i + 1;
|
||||
|
||||
if (usedVGPRS > 0) {
|
||||
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD);
|
||||
size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
|
||||
|
||||
// Calculate blocks occupancy per CU based on VGPR usage
|
||||
if (vgprs_alu_occupancy < wavefronts)
|
||||
break;
|
||||
}
|
||||
|
||||
if (usedSGPRS > 0) {
|
||||
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
|
||||
size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on SGPR usage
|
||||
if (sgprs_alu_occupancy < wavefronts)
|
||||
break;
|
||||
}
|
||||
maxWaves = wavefronts;
|
||||
}
|
||||
|
||||
r.maxThreadsPerBlock = maxWaves * wavefrontSize;
|
||||
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
|
||||
|
||||
return r;
|
||||
|
||||
새 이슈에서 참조
사용자 차단