[HIP] Refactor cooperative APIs

[ROCm/hip commit: 4844fbdf0a]
Этот коммит содержится в:
Aryan Salmanpour
2020-03-06 18:30:12 -05:00
родитель 624b6cd45e
Коммит c25dd0ca3d
2 изменённых файлов: 205 добавлений и 129 удалений
+3 -1
Просмотреть файл
@@ -1619,7 +1619,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
// Allows runtime to track some information about the stream.
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
const char* kernelNameStr, bool lockAcquired) {
stream = ihipSyncAndResolveStream(stream, lockAcquired);
if (stream == nullptr || stream != stream->getCtx()->_defaultStream){
stream = ihipSyncAndResolveStream(stream, lockAcquired);
}
lp->grid_dim.x = grid.x;
lp->grid_dim.y = grid.y;
lp->grid_dim.z = grid.z;
+202 -128
Просмотреть файл
@@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
void** impCoopParams = 0) {
void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) {
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
@@ -192,7 +192,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
if (impCoopParams) {
const auto p{static_cast<const char*>(*impCoopParams)};
// The sixth index is for multi-grid synchronization
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
kernargs.insert((kernargs.cend() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
}
@@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
hc::completion_future cf;
if (coopAV) {
lp.av = coopAV;
}
lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(),
(startEvent || stopEvent) ? &cf : nullptr
#if (__hcc_workweek__ > 17312)
@@ -399,6 +403,82 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
return result;
}
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
if (f->_is_code_object_v3) {
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
*usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
*usedLDS = header->group_segment_fixed_size;
}
else {
const auto header = f->_header;
// VGPRs granularity is 4
*usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
*usedSGPRS = header->wavefront_sgpr_count + 2;
*usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
*usedLDS = header->workgroup_group_segment_byte_size;
}
}
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
if (ctx == nullptr) {
return hipErrorInvalidDevice;
}
hipDeviceProp_t prop{};
ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
size_t usedVGPRS = 0;
size_t usedSGPRS = 0;
size_t usedLDS = 0;
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
// Due to SPI and private memory limitations, the max of wavefronts per CU in 32
size_t wavefrontSize = prop.warpSize;
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
const size_t simdPerCU = 4;
const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
// Calculate blocks occupancy per CU based on VGPR usage
*numBlocks = vgprs_alu_occupancy / numWavefronts;
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
}
return hipSuccess;
}
namespace {
// kernel for initializing GWS
// nwm1 is the total number of work groups minus 1
@@ -412,13 +492,16 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes,
hipStream_t stream, hip_impl::program_state& ps) {
#if (__hcc_workweek__ >= 20093)
hipError_t result;
if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) {
if (f == nullptr || kernelParams == nullptr) {
return hipErrorNotInitialized;
}
stream = ihipSyncAndResolveStream(stream);
if (!stream->getDevice()->_props.cooperativeLaunch) {
return hipErrorInvalidConfiguration;
}
@@ -459,28 +542,44 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
kd->_kernarg_layout = *reinterpret_cast<const std::vector<
std::pair<std::size_t, std::size_t>>*>(kargs.getHandle());
GET_TLS();
uint32_t numBlocksPerSm;
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd,
stream->getDevice()->_props.warpSize, sharedMemBytes);
if (result != hipSuccess) {
return hipErrorLaunchFailure;
}
int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount;
//check to see if the workload fits on the GPU
if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks){
return hipErrorCooperativeLaunchTooLarge;
}
void *gwsKernelParam[1];
// calculate total number of work groups minus 1 for the main kernel
uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1;
gwsKernelParam[0] = &nwm1;
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
#if (__hcc_workweek__ >= 19213)
streamCrit->_av.acquire_locked_hsa_queue();
#endif
hc::accelerator acc = stream->getDevice()->_acc;
// create a cooperative accelerated view for launching gws and main kernels
hc::accelerator_view coopAV = acc.create_cooperative_view();
GET_TLS();
// launch the init_gws kernel to initialize the GWS
// wait for this stream to finish operations
stream->locked_wait();
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
streamCrit->_av.acquire_locked_hsa_queue();
coopAV.acquire_locked_hsa_queue();
// launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue
result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV);
if (result != hipSuccess) {
stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAV.release_locked_hsa_queue();
return hipErrorLaunchFailure;
}
@@ -488,60 +587,88 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
void* impCoopParams[1];
impCoopParams[0] = &impCoopArg;
// launch the main kernel
// launch the main kernel in the cooperative queue
result = ihipModuleLaunchKernel(tls, kd,
gridDim.x * blockDimX.x,
gridDim.y * blockDimX.y,
gridDim.z * blockDimX.z,
blockDimX.x, blockDimX.y, blockDimX.z,
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
nullptr, 0, true, impCoopParams);
nullptr, 0, true, impCoopParams, &coopAV);
stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAV.release_locked_hsa_queue();
// wait on the dispatch on the dedicated cooperative queue to finish
coopAV.wait(hc::hcWaitModeActive);
return result;
#else
return hipErrorInvalidConfiguration;
#endif
}
__attribute__((visibility("default")))
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags, hip_impl::program_state& ps) {
#if (__hcc_workweek__ >= 20093)
hipError_t result;
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
return hipErrorInvalidValue;
}
vector<hipStream_t> streams;
vector<uint64_t> deviceIDs;
// check to see if we have valid distinct streams/devices, if cooperative multi device
// launch is supported and if grid/block dimensions are valid
for (int i = 0; i < numDevices; ++i) {
if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) {
const hipLaunchParams& lp = launchParamsList[i];
if (lp.stream == nullptr){
return hipErrorInvalidResourceHandle;
}
auto it = find(streams.begin(), streams.end(), lp.stream);
if (it == streams.end()){
streams.push_back(lp.stream);
} else{
return hipErrorInvalidDevice;
}
const ihipDevice_t* currentDevice = lp.stream->getDevice();
auto it1 = find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId);
if (it1 == deviceIDs.end()){
deviceIDs.push_back(currentDevice->_deviceId);
} else {
return hipErrorInvalidDevice;
}
if (!currentDevice->_props.cooperativeMultiDeviceLaunch) {
return hipErrorInvalidConfiguration;
}
if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 ||
lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0){
return hipErrorInvalidConfiguration;
}
}
hipFunction_t* gwsKds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
if (kds == nullptr || gwsKds == nullptr) {
return hipErrorNotInitialized;
}
vector<hipFunction_t> gwsKds;
vector<hipFunction_t> kds;
GET_TLS();
// prepare all kernel descriptors for initializing the GWS and the main kernels per device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
if (lp.stream == nullptr) {
free(gwsKds);
free(kds);
return hipErrorNotInitialized;
}
gwsKds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
hip_impl::target_agent(lp.stream));
gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
hip_impl::target_agent(lp.stream)));
if (gwsKds[i] == nullptr) {
free(gwsKds);
free(kds);
return hipErrorInvalidValue;
}
hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
@@ -550,23 +677,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
gwsKargs.getHandle());
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
hip_impl::target_agent(lp.stream));
kds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
hip_impl::target_agent(lp.stream)));
if (kds[i] == nullptr) {
free(gwsKds);
free(kds);
return hipErrorInvalidValue;
}
hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
reinterpret_cast<std::uintptr_t>(lp.func));
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
kargs.getHandle());
uint32_t numBlocksPerSm;
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i],
lp.stream->getDevice()->_props.warpSize, lp.sharedMem);
if (result != hipSuccess) {
return hipErrorLaunchFailure;
}
int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount;
//check to see if the workload fits on the GPU
if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks){
return hipErrorCooperativeLaunchTooLarge;
}
}
vector<hc::accelerator_view> coopAVs;
// create cooperative accelerated views for launching gws and main kernels on each device
for (int i = 0; i < numDevices; ++i) {
hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc;
coopAVs.push_back(acc.create_cooperative_view());
}
mg_sync *mg_sync_ptr = 0;
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
vector<mg_info *> mg_info_ptr;
GET_TLS();
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
if (result != hipSuccess) {
return hipErrorInvalidValue;
@@ -576,7 +722,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
uint all_sum = 0;
for (int i = 0; i < numDevices; ++i) {
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
mg_info *mg_info_temp;
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault);
if (result != hipSuccess) {
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < i; ++j) {
@@ -584,6 +731,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
}
return hipErrorInvalidValue;
}
mg_info_ptr.push_back(mg_info_temp);
// calculate the sum of sizes of all grids
const hipLaunchParams& lp = launchParamsList[i];
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
@@ -592,10 +740,10 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
for (int i = 0; i < numDevices; ++i) {
launchParamsList[i].stream->locked_wait();
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
#if (__hcc_workweek__ >= 19213)
streamCrit->_av.acquire_locked_hsa_queue();
#endif
coopAVs[i].acquire_locked_hsa_queue();
}
// launch the init_gws kernel to initialize the GWS for each device
@@ -607,14 +755,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
gwsKernelParam[0] = &nwm1;
result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1,
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]);
if (result != hipSuccess) {
for (int j = 0; j < numDevices; ++j) {
launchParamsList[j].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAVs[i].release_locked_hsa_queue();
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
@@ -660,14 +807,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
true, impCoopParams);
true, impCoopParams, &coopAVs[i]);
if (result != hipSuccess) {
for (int j = 0; j < numDevices; ++j) {
launchParamsList[j].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAVs[i].release_locked_hsa_queue();
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
@@ -682,13 +828,14 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
// unlock all streams
for (int i = 0; i < numDevices; ++i) {
launchParamsList[i].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAVs[i].release_locked_hsa_queue();
}
free(gwsKds);
free(kds);
// wait on the dispatch on cooperative queues on each device to finish
for (int i = 0; i < numDevices; ++i) {
coopAVs[i].wait(hc::hcWaitModeActive);
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
@@ -696,6 +843,9 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
}
return result;
#else
return hipErrorInvalidConfiguration;
#endif
}
namespace hip_impl {
@@ -1264,29 +1414,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
return ihipLogStatus(hipSuccess);
}
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
if (f->_is_code_object_v3) {
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
*usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
*usedLDS = header->group_segment_fixed_size;
}
else {
const auto header = f->_header;
// VGPRs granularity is 4
*usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
*usedSGPRS = header->wavefront_sgpr_count + 2;
*usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
*usedLDS = header->workgroup_group_segment_byte_size;
}
}
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
hipFunction_t f, size_t dynSharedMemPerBlk,
uint32_t blockSizeLimit)
@@ -1409,59 +1536,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
}
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
if (ctx == nullptr) {
return hipErrorInvalidDevice;
}
hipDeviceProp_t prop{};
ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
size_t usedVGPRS = 0;
size_t usedSGPRS = 0;
size_t usedLDS = 0;
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
// Due to SPI and private memory limitations, the max of wavefronts per CU in 32
size_t wavefrontSize = prop.warpSize;
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
const size_t simdPerCU = 4;
const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
// Calculate blocks occupancy per CU based on VGPR usage
*numBlocks = vgprs_alu_occupancy / numWavefronts;
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
}
return hipSuccess;
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{