diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index c9688408c8..0434a3518c 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1619,7 +1619,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp, // Allows runtime to track some information about the stream. hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired) { - stream = ihipSyncAndResolveStream(stream, lockAcquired); + if (stream == nullptr || stream != stream->getCtx()->_defaultStream){ + stream = ihipSyncAndResolveStream(stream, lockAcquired); + } lp->grid_dim.x = grid.x; lp->grid_dim.y = grid.y; lp->grid_dim.z = grid.z; diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index a88abba9cb..8c8c841809 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, - void** impCoopParams = 0) { + void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) { using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); @@ -192,7 +192,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global if (impCoopParams) { const auto p{static_cast(*impCoopParams)}; // The sixth index is for multi-grid synchronization - kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT, + kernargs.insert((kernargs.cend() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT, p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT); } @@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global hc::completion_future cf; + if (coopAV) { + lp.av = coopAV; + } + lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(), (startEvent || stopEvent) ? &cf : nullptr #if (__hcc_workweek__ > 17312) @@ -399,6 +403,82 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList return result; } +void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) +{ + if (f->_is_code_object_v3) { + const auto header = reinterpret_cast(f->_header); + // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 + // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) + *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2; + // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1 + // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1) + *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4; + *usedLDS = header->group_segment_fixed_size; + } + else { + const auto header = f->_header; + // VGPRs granularity is 4 + *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2; + // adding 2 to take into account the 2 VCC registers & handle the granularity of 16 + *usedSGPRS = header->wavefront_sgpr_count + 2; + *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4; + *usedLDS = header->workgroup_group_segment_byte_size; + } +} + +hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( + TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) +{ + using namespace hip_impl; + + auto ctx = ihipGetTlsDefaultCtx(); + if (ctx == nullptr) { + return hipErrorInvalidDevice; + } + + hipDeviceProp_t prop{}; + ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); + + prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; + + size_t usedVGPRS = 0; + size_t usedSGPRS = 0; + size_t usedLDS = 0; + getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); + + // Due to SPI and private memory limitations, the max of wavefronts per CU in 32 + size_t wavefrontSize = prop.warpSize; + size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); + + const size_t simdPerCU = 4; + const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU; + + size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; + + size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); + size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); + + // Calculate blocks occupancy per CU based on VGPR usage + *numBlocks = vgprs_alu_occupancy / numWavefronts; + + const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; + size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); + + // Calculate blocks occupancy per CU based on SGPR usage + *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + + size_t total_used_lds = usedLDS + dynSharedMemPerBlk; + if (total_used_lds != 0) { + // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) + size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; + *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + } + + return hipSuccess; +} + namespace { // kernel for initializing GWS // nwm1 is the total number of work groups minus 1 @@ -412,13 +492,16 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps) { +#if (__hcc_workweek__ >= 20093) hipError_t result; - if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) { + if (f == nullptr || kernelParams == nullptr) { return hipErrorNotInitialized; } + stream = ihipSyncAndResolveStream(stream); + if (!stream->getDevice()->_props.cooperativeLaunch) { return hipErrorInvalidConfiguration; } @@ -459,28 +542,44 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, kd->_kernarg_layout = *reinterpret_cast>*>(kargs.getHandle()); + GET_TLS(); + uint32_t numBlocksPerSm; + result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd, + stream->getDevice()->_props.warpSize, sharedMemBytes); + if (result != hipSuccess) { + return hipErrorLaunchFailure; + } + int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount; + + //check to see if the workload fits on the GPU + if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks){ + return hipErrorCooperativeLaunchTooLarge; + } void *gwsKernelParam[1]; // calculate total number of work groups minus 1 for the main kernel uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1; gwsKernelParam[0] = &nwm1; - LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false); -#if (__hcc_workweek__ >= 19213) - streamCrit->_av.acquire_locked_hsa_queue(); -#endif + hc::accelerator acc = stream->getDevice()->_acc; + // create a cooperative accelerated view for launching gws and main kernels + hc::accelerator_view coopAV = acc.create_cooperative_view(); - GET_TLS(); - // launch the init_gws kernel to initialize the GWS + // wait for this stream to finish operations + stream->locked_wait(); + + LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false); + streamCrit->_av.acquire_locked_hsa_queue(); + coopAV.acquire_locked_hsa_queue(); + + // launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1, - 0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true); + 0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV); if (result != hipSuccess) { stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) stream->criticalData()._av.release_locked_hsa_queue(); -#endif - + coopAV.release_locked_hsa_queue(); return hipErrorLaunchFailure; } @@ -488,60 +587,88 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, void* impCoopParams[1]; impCoopParams[0] = &impCoopArg; - // launch the main kernel + // launch the main kernel in the cooperative queue result = ihipModuleLaunchKernel(tls, kd, gridDim.x * blockDimX.x, gridDim.y * blockDimX.y, gridDim.z * blockDimX.z, blockDimX.x, blockDimX.y, blockDimX.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, true, impCoopParams); + nullptr, 0, true, impCoopParams, &coopAV); stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) stream->criticalData()._av.release_locked_hsa_queue(); -#endif + coopAV.release_locked_hsa_queue(); + + // wait on the dispatch on the dedicated cooperative queue to finish + coopAV.wait(hc::hcWaitModeActive); + return result; +#else + return hipErrorInvalidConfiguration; +#endif + } __attribute__((visibility("default"))) hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags, hip_impl::program_state& ps) { +#if (__hcc_workweek__ >= 20093) hipError_t result; if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { return hipErrorInvalidValue; } + vector streams; + vector deviceIDs; + // check to see if we have valid distinct streams/devices, if cooperative multi device + // launch is supported and if grid/block dimensions are valid for (int i = 0; i < numDevices; ++i) { - if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) { + const hipLaunchParams& lp = launchParamsList[i]; + + if (lp.stream == nullptr){ + return hipErrorInvalidResourceHandle; + } + + auto it = find(streams.begin(), streams.end(), lp.stream); + if (it == streams.end()){ + streams.push_back(lp.stream); + } else{ + return hipErrorInvalidDevice; + } + + const ihipDevice_t* currentDevice = lp.stream->getDevice(); + auto it1 = find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId); + if (it1 == deviceIDs.end()){ + deviceIDs.push_back(currentDevice->_deviceId); + } else { + return hipErrorInvalidDevice; + } + + if (!currentDevice->_props.cooperativeMultiDeviceLaunch) { + return hipErrorInvalidConfiguration; + } + + if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 || + lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0){ return hipErrorInvalidConfiguration; } } - hipFunction_t* gwsKds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); - hipFunction_t* kds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); - - if (kds == nullptr || gwsKds == nullptr) { - return hipErrorNotInitialized; - } + vector gwsKds; + vector kds; + GET_TLS(); // prepare all kernel descriptors for initializing the GWS and the main kernels per device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; - if (lp.stream == nullptr) { - free(gwsKds); - free(kds); - return hipErrorNotInitialized; - } - gwsKds[i] = ps.kernel_descriptor(reinterpret_cast(&init_gws), - hip_impl::target_agent(lp.stream)); + gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast(&init_gws), + hip_impl::target_agent(lp.stream))); if (gwsKds[i] == nullptr) { - free(gwsKds); - free(kds); return hipErrorInvalidValue; } hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align( @@ -550,23 +677,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL gwsKargs.getHandle()); - kds[i] = ps.kernel_descriptor(reinterpret_cast(lp.func), - hip_impl::target_agent(lp.stream)); + kds.push_back(ps.kernel_descriptor(reinterpret_cast(lp.func), + hip_impl::target_agent(lp.stream))); if (kds[i] == nullptr) { - free(gwsKds); - free(kds); return hipErrorInvalidValue; } hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align( reinterpret_cast(lp.func)); kds[i]->_kernarg_layout = *reinterpret_cast>*>( kargs.getHandle()); + + uint32_t numBlocksPerSm; + result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i], + lp.stream->getDevice()->_props.warpSize, lp.sharedMem); + if (result != hipSuccess) { + return hipErrorLaunchFailure; + } + int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount; + + //check to see if the workload fits on the GPU + if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks){ + return hipErrorCooperativeLaunchTooLarge; + } + } + + vector coopAVs; + + // create cooperative accelerated views for launching gws and main kernels on each device + for (int i = 0; i < numDevices; ++i) { + hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc; + coopAVs.push_back(acc.create_cooperative_view()); } mg_sync *mg_sync_ptr = 0; - mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0}; + vector mg_info_ptr; + - GET_TLS(); result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); if (result != hipSuccess) { return hipErrorInvalidValue; @@ -576,7 +722,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL uint all_sum = 0; for (int i = 0; i < numDevices; ++i) { - result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault); + mg_info *mg_info_temp; + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault); if (result != hipSuccess) { hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < i; ++j) { @@ -584,6 +731,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL } return hipErrorInvalidValue; } + mg_info_ptr.push_back(mg_info_temp); // calculate the sum of sizes of all grids const hipLaunchParams& lp = launchParamsList[i]; all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * @@ -592,10 +740,10 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device for (int i = 0; i < numDevices; ++i) { + launchParamsList[i].stream->locked_wait(); LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); -#if (__hcc_workweek__ >= 19213) streamCrit->_av.acquire_locked_hsa_queue(); -#endif + coopAVs[i].acquire_locked_hsa_queue(); } // launch the init_gws kernel to initialize the GWS for each device @@ -607,14 +755,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL gwsKernelParam[0] = &nwm1; result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1, - 0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true); + 0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]); if (result != hipSuccess) { for (int j = 0; j < numDevices; ++j) { launchParamsList[j].stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); -#endif + coopAVs[i].release_locked_hsa_queue(); } hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < numDevices; ++j) { @@ -660,14 +807,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true, impCoopParams); + true, impCoopParams, &coopAVs[i]); if (result != hipSuccess) { for (int j = 0; j < numDevices; ++j) { launchParamsList[j].stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); -#endif + coopAVs[i].release_locked_hsa_queue(); } hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < numDevices; ++j) { @@ -682,13 +828,14 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL // unlock all streams for (int i = 0; i < numDevices; ++i) { launchParamsList[i].stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue(); -#endif + coopAVs[i].release_locked_hsa_queue(); } - free(gwsKds); - free(kds); + // wait on the dispatch on cooperative queues on each device to finish + for (int i = 0; i < numDevices; ++i) { + coopAVs[i].wait(hc::hcWaitModeActive); + } hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < numDevices; ++j) { @@ -696,6 +843,9 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL } return result; +#else + return hipErrorInvalidConfiguration; +#endif } namespace hip_impl { @@ -1264,29 +1414,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const return ihipLogStatus(hipSuccess); } -void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) -{ - if (f->_is_code_object_v3) { - const auto header = reinterpret_cast(f->_header); - // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) - *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2; - // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1) - *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4; - *usedLDS = header->group_segment_fixed_size; - } - else { - const auto header = f->_header; - // VGPRs granularity is 4 - *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2; - // adding 2 to take into account the 2 VCC registers & handle the granularity of 16 - *usedSGPRS = header->wavefront_sgpr_count + 2; - *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4; - *usedLDS = header->workgroup_group_segment_byte_size; - } -} - hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) @@ -1409,59 +1536,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); } -hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) -{ - using namespace hip_impl; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx == nullptr) { - return hipErrorInvalidDevice; - } - - hipDeviceProp_t prop{}; - ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); - - prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - - size_t usedVGPRS = 0; - size_t usedSGPRS = 0; - size_t usedLDS = 0; - getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); - - // Due to SPI and private memory limitations, the max of wavefronts per CU in 32 - size_t wavefrontSize = prop.warpSize; - size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); - - const size_t simdPerCU = 4; - const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU; - - size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; - - size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); - size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd - : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); - - // Calculate blocks occupancy per CU based on VGPR usage - *numBlocks = vgprs_alu_occupancy / numWavefronts; - - const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd - : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); - - // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); - - size_t total_used_lds = usedLDS + dynSharedMemPerBlk; - if (total_used_lds != 0) { - // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) - size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); - } - - return hipSuccess; -} - hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) {