[HIP] Refactor cooperative APIs

[ROCm/hip commit: 4844fbdf0a]
2020-03-06 18:30:12 -05:00
@@ -1619,7 +1619,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
 // Allows runtime to track some information about the stream.
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
                                const char* kernelNameStr, bool lockAcquired) {
-    stream = ihipSyncAndResolveStream(stream, lockAcquired);
+    if (stream == nullptr || stream != stream->getCtx()->_defaultStream){
+        stream = ihipSyncAndResolveStream(stream, lockAcquired);
+    }
    lp->grid_dim.x = grid.x;
    lp->grid_dim.y = grid.y;
    lp->grid_dim.z = grid.z;
@@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
                                  uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                  hipStream_t hStream, void** kernelParams, void** extra,
                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
-                                  void** impCoopParams = 0) {
+                                  void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) {
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
@@ -192,7 +192,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
        if (impCoopParams) {
            const auto p{static_cast<const char*>(*impCoopParams)};
            // The sixth index is for multi-grid synchronization
-            kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
+            kernargs.insert((kernargs.cend() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
                            p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
        }

@@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global

        hc::completion_future cf;

+        if (coopAV) {
+            lp.av = coopAV;
+        }
+
        lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(),
                                   (startEvent || stopEvent) ? &cf : nullptr
 #if (__hcc_workweek__ > 17312)
@@ -399,6 +403,82 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
    return result;
 }

+void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
+{
+    if (f->_is_code_object_v3) {
+        const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
+        // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
+        // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
+        *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
+        // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
+        // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
+        *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
+        *usedLDS = header->group_segment_fixed_size;
+    }
+    else {
+        const auto header = f->_header;
+        // VGPRs granularity is 4
+        *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
+        // adding 2 to take into account the 2 VCC registers & handle the granularity of 16
+        *usedSGPRS = header->wavefront_sgpr_count + 2;
+        *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
+        *usedLDS = header->workgroup_group_segment_byte_size;
+    }
+}
+
+hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+   TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+{
+    using namespace hip_impl;
+
+    auto ctx = ihipGetTlsDefaultCtx();
+    if (ctx == nullptr) {
+        return hipErrorInvalidDevice;
+    }
+
+    hipDeviceProp_t prop{};
+    ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
+
+    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
+
+    size_t usedVGPRS = 0;
+    size_t usedSGPRS = 0;
+    size_t usedLDS = 0;
+    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
+
+    // Due to SPI and private memory limitations, the max of wavefronts per CU in 32
+    size_t wavefrontSize = prop.warpSize;
+    size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
+
+    const size_t simdPerCU = 4;
+    const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
+
+    size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
+
+    size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
+    size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
+        : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
+
+    // Calculate blocks occupancy per CU based on VGPR usage
+    *numBlocks = vgprs_alu_occupancy / numWavefronts;
+
+    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
+    size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
+        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
+
+    // Calculate blocks occupancy per CU based on SGPR usage
+    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
+
+    size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
+    if (total_used_lds != 0) {
+      // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
+      size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
+      *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
+    }
+
+    return hipSuccess;
+}
+
 namespace {
 // kernel for initializing GWS
 // nwm1 is the total number of work groups minus 1
@@ -412,13 +492,16 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
        dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes,
        hipStream_t stream, hip_impl::program_state& ps) {

+#if (__hcc_workweek__ >= 20093)
    hipError_t result;


-    if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) {
+    if (f == nullptr || kernelParams == nullptr) {
        return hipErrorNotInitialized;
    }

+    stream = ihipSyncAndResolveStream(stream);
+
    if (!stream->getDevice()->_props.cooperativeLaunch) {
        return hipErrorInvalidConfiguration;
    }
@@ -459,28 +542,44 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
    kd->_kernarg_layout = *reinterpret_cast<const std::vector<
            std::pair<std::size_t, std::size_t>>*>(kargs.getHandle());

+    GET_TLS();
+    uint32_t numBlocksPerSm;
+    result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd,
+                    stream->getDevice()->_props.warpSize, sharedMemBytes);
+    if (result != hipSuccess) {
+        return hipErrorLaunchFailure;
+    }
+    int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount;
+
+    //check to see if the workload fits on the GPU
+    if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks){
+        return hipErrorCooperativeLaunchTooLarge;
+    }

    void *gwsKernelParam[1];
    // calculate total number of work groups minus 1 for the main kernel
    uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1;
    gwsKernelParam[0] = &nwm1;

-    LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
-#if (__hcc_workweek__ >= 19213)
-    streamCrit->_av.acquire_locked_hsa_queue();
-#endif
+    hc::accelerator acc = stream->getDevice()->_acc;
+    // create a cooperative accelerated view for launching gws and main kernels
+    hc::accelerator_view coopAV = acc.create_cooperative_view();

-    GET_TLS();
-    // launch the init_gws kernel to initialize the GWS
+    // wait for this stream to finish operations
+    stream->locked_wait();
+
+    LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
+    streamCrit->_av.acquire_locked_hsa_queue();
+    coopAV.acquire_locked_hsa_queue();
+
+    // launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue
    result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
-             0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
+             0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV);

    if (result != hipSuccess) {
        stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
        stream->criticalData()._av.release_locked_hsa_queue();
-#endif
-
+        coopAV.release_locked_hsa_queue();
        return hipErrorLaunchFailure;
    }

@@ -488,60 +587,88 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
    void* impCoopParams[1];
    impCoopParams[0] = &impCoopArg;

-    // launch the main kernel
+    // launch the main kernel in the cooperative queue
    result = ihipModuleLaunchKernel(tls, kd,
            gridDim.x * blockDimX.x,
            gridDim.y * blockDimX.y,
            gridDim.z * blockDimX.z,
            blockDimX.x, blockDimX.y, blockDimX.z,
            sharedMemBytes, stream, kernelParams, nullptr, nullptr,
-            nullptr, 0, true, impCoopParams);
+            nullptr, 0, true, impCoopParams, &coopAV);

    stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
    stream->criticalData()._av.release_locked_hsa_queue();
-#endif
+    coopAV.release_locked_hsa_queue();
+
+    // wait on the dispatch on the dedicated cooperative queue to finish
+    coopAV.wait(hc::hcWaitModeActive);
+

    return result;
+#else
+    return hipErrorInvalidConfiguration;
+#endif
+
 }

 __attribute__((visibility("default")))
 hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
        int  numDevices, unsigned int  flags, hip_impl::program_state& ps) {

+#if (__hcc_workweek__ >= 20093)
    hipError_t result;

    if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
        return hipErrorInvalidValue;
    }

+    vector<hipStream_t> streams;
+    vector<uint64_t> deviceIDs;
+    // check to see if we have valid distinct streams/devices, if cooperative multi device
+    // launch is supported and if grid/block dimensions are valid
    for (int i = 0; i < numDevices; ++i) {
-        if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) {
+        const hipLaunchParams& lp = launchParamsList[i];
+
+        if (lp.stream == nullptr){
+            return hipErrorInvalidResourceHandle;
+        }
+
+        auto it = find(streams.begin(), streams.end(), lp.stream);
+        if (it == streams.end()){
+            streams.push_back(lp.stream);
+        } else{
+            return hipErrorInvalidDevice;
+        }
+
+        const ihipDevice_t* currentDevice = lp.stream->getDevice();
+        auto it1 = find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId);
+        if (it1 == deviceIDs.end()){
+            deviceIDs.push_back(currentDevice->_deviceId);
+        } else {
+            return hipErrorInvalidDevice;
+        }
+
+        if (!currentDevice->_props.cooperativeMultiDeviceLaunch) {
+            return hipErrorInvalidConfiguration;
+        }
+
+        if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 ||
+            lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0){
            return hipErrorInvalidConfiguration;
        }
    }

-    hipFunction_t* gwsKds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
-    hipFunction_t* kds    = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
-
-    if (kds == nullptr || gwsKds == nullptr) {
-        return hipErrorNotInitialized;
-    }
+    vector<hipFunction_t> gwsKds;
+    vector<hipFunction_t> kds;

+    GET_TLS();
    // prepare all kernel descriptors for initializing the GWS and the main kernels per device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];
-        if (lp.stream == nullptr) {
-            free(gwsKds);
-            free(kds);
-            return hipErrorNotInitialized;
-        }

-        gwsKds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
-                hip_impl::target_agent(lp.stream));
+        gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
+                hip_impl::target_agent(lp.stream)));
        if (gwsKds[i] == nullptr) {
-            free(gwsKds);
-            free(kds);
            return hipErrorInvalidValue;
        }
        hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
@@ -550,23 +677,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
                gwsKargs.getHandle());


-        kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
-                hip_impl::target_agent(lp.stream));
+        kds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
+                hip_impl::target_agent(lp.stream)));
        if (kds[i] == nullptr) {
-            free(gwsKds);
-            free(kds);
            return hipErrorInvalidValue;
        }
        hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
                reinterpret_cast<std::uintptr_t>(lp.func));
        kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
                kargs.getHandle());
+
+        uint32_t numBlocksPerSm;
+        result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i],
+                        lp.stream->getDevice()->_props.warpSize, lp.sharedMem);
+        if (result != hipSuccess) {
+            return hipErrorLaunchFailure;
+        }
+        int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount;
+
+        //check to see if the workload fits on the GPU
+        if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks){
+            return hipErrorCooperativeLaunchTooLarge;
+        }
+    }
+
+    vector<hc::accelerator_view> coopAVs;
+
+    // create cooperative accelerated views for launching gws and main kernels on each device
+    for (int i = 0; i < numDevices; ++i) {
+        hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc;
+        coopAVs.push_back(acc.create_cooperative_view());
    }

    mg_sync *mg_sync_ptr = 0;
-    mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
+    vector<mg_info *> mg_info_ptr;
+

-    GET_TLS();
    result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
    if (result != hipSuccess) {
        return hipErrorInvalidValue;
@@ -576,7 +722,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL

    uint all_sum = 0;
    for (int i = 0; i < numDevices; ++i) {
-        result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
+        mg_info *mg_info_temp;
+        result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault);
        if (result != hipSuccess) {
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < i; ++j) {
@@ -584,6 +731,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
            }
            return hipErrorInvalidValue;
        }
+        mg_info_ptr.push_back(mg_info_temp);
        // calculate the sum of sizes of all grids
        const hipLaunchParams& lp = launchParamsList[i];
        all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
@@ -592,10 +740,10 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL

    // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
    for (int i = 0; i < numDevices; ++i) {
+        launchParamsList[i].stream->locked_wait();
        LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
-#if (__hcc_workweek__ >= 19213)
        streamCrit->_av.acquire_locked_hsa_queue();
-#endif
+        coopAVs[i].acquire_locked_hsa_queue();
    }

    // launch the init_gws kernel to initialize the GWS for each device
@@ -607,14 +755,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
        gwsKernelParam[0] = &nwm1;

        result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1,
-                0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
+                0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]);

        if (result != hipSuccess) {
            for (int j = 0; j < numDevices; ++j) {
                launchParamsList[j].stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
-#endif
+                coopAVs[i].release_locked_hsa_queue();
            }
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < numDevices; ++j) {
@@ -660,14 +807,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
                lp.blockDim.x, lp.blockDim.y,
                lp.blockDim.z, lp.sharedMem,
                lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
-                true, impCoopParams);
+                true, impCoopParams, &coopAVs[i]);

        if (result != hipSuccess) {
            for (int j = 0; j < numDevices; ++j) {
                launchParamsList[j].stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
-#endif
+                coopAVs[i].release_locked_hsa_queue();
            }
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < numDevices; ++j) {
@@ -682,13 +828,14 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
    // unlock all streams
    for (int i = 0; i < numDevices; ++i) {
        launchParamsList[i].stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
        launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
-#endif
+        coopAVs[i].release_locked_hsa_queue();
    }

-    free(gwsKds);
-    free(kds);
+    // wait on the dispatch on cooperative queues on each device to finish
+    for (int i = 0; i < numDevices; ++i) {
+        coopAVs[i].wait(hc::hcWaitModeActive);
+    }

    hip_internal::ihipHostFree(tls, mg_sync_ptr);
    for (int j = 0; j < numDevices; ++j) {
@@ -696,6 +843,9 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
    }

    return result;
+#else
+    return hipErrorInvalidConfiguration;
+#endif
 }

 namespace hip_impl {
@@ -1264,29 +1414,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
    return ihipLogStatus(hipSuccess);
 }

-void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
-{
-    if (f->_is_code_object_v3) {
-        const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
-        // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
-        // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
-        *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
-        // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
-        // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
-        *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
-        *usedLDS = header->group_segment_fixed_size;
-    }
-    else {
-        const auto header = f->_header;
-        // VGPRs granularity is 4
-        *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
-        // adding 2 to take into account the 2 VCC registers & handle the granularity of 16
-        *usedSGPRS = header->wavefront_sgpr_count + 2;
-        *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
-        *usedLDS = header->workgroup_group_segment_byte_size;
-    }
-}
-
 hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
                                              hipFunction_t f, size_t dynSharedMemPerBlk,
                                              uint32_t blockSizeLimit)
@@ -1409,59 +1536,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
        gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
 }

-hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-   TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
-{
-    using namespace hip_impl;
-
-    auto ctx = ihipGetTlsDefaultCtx();
-    if (ctx == nullptr) {
-        return hipErrorInvalidDevice;
-    }
-
-    hipDeviceProp_t prop{};
-    ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
-
-    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
-
-    size_t usedVGPRS = 0;
-    size_t usedSGPRS = 0;
-    size_t usedLDS = 0;
-    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
-
-    // Due to SPI and private memory limitations, the max of wavefronts per CU in 32
-    size_t wavefrontSize = prop.warpSize;
-    size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
-
-    const size_t simdPerCU = 4;
-    const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
-
-    size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
-
-    size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
-    size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
-        : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
-
-    // Calculate blocks occupancy per CU based on VGPR usage
-    *numBlocks = vgprs_alu_occupancy / numWavefronts;
-
-    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
-    size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
-        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
-
-    // Calculate blocks occupancy per CU based on SGPR usage
-    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
-
-    size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
-    if (total_used_lds != 0) {
-      // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
-      size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
-      *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
-    }
-
-    return hipSuccess;
-}
-
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
 {