rocm-systems/projects/hip/src/hip_module.cpp

/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include "hip/hip_runtime.h"
#include "hip/hcc_detail/elfio/elfio.hpp"
#include "hip/hcc_detail/hsa_helpers.hpp"
#include "hip/hcc_detail/program_state.hpp"
#include "hip_hcc_internal.h"
#include "hip/hip_ext.h"
#include "program_state.inl"
#include "trace_helper.h"
#include "hc_am.hpp"

#include <hsa/amd_hsa_kernel_code.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <map>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "code_object_bundle.inl"
#include "hip_fatbin.h"
// TODO Use Pool APIs from HCC to get memory regions.

using namespace ELFIO;
using namespace std;

// For HIP implicit kernargs.
static const size_t HIP_IMPLICIT_KERNARG_SIZE = 56;
static const size_t HIP_IMPLICIT_KERNARG_ALIGNMENT = 8;

struct amd_kernel_code_v3_t {
  uint32_t group_segment_fixed_size;
  uint32_t private_segment_fixed_size;
  uint8_t reserved0[8];
  int64_t kernel_code_entry_byte_offset;
  uint8_t reserved1[24];
  uint32_t compute_pgm_rsrc1;
  uint32_t compute_pgm_rsrc2;
  uint16_t kernel_code_properties;
  uint8_t reserved2[6];
};

// calculate MD5 checksum
inline std::string checksum(size_t size, const char *source) {
    // FNV-1a hashing, 64-bit version
    const uint64_t FNV_prime = 0x100000001b3;
    const uint64_t FNV_basis = 0xcbf29ce484222325;
    uint64_t hash = FNV_basis;

    const char *str = static_cast<const char *>(source);
    for (auto i = 0; i < size; ++i) {
        hash ^= *str++;
        hash *= FNV_prime;
    }
    return std::to_string(hash);
}

inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
    assert(Align != 0u && "Align can't be 0.");
    Skew %= Align;
    return (Value + Align - 1 - Skew) / Align * Align + Skew;
}


struct ihipKernArgInfo {
    vector<uint32_t> Size;
    vector<uint32_t> Align;
    vector<string> ArgType;
    vector<string> ArgName;
    uint32_t totalSize;
};

map<string, ihipKernArgInfo> kernelArguments;

struct ihipModuleSymbol_t {
    uint64_t _object{};  // The kernel object.
    amd_kernel_code_t const* _header{};
    string _name;  // TODO - review for performance cost.  Name is just used for debug.
    vector<pair<size_t, size_t>> _kernarg_layout{};
    bool _is_code_object_v3{};
};

template <>
string ToString(hipFunction_t v) {
    std::ostringstream ss;
    ss << "0x" << std::hex << v->_object;
    return ss.str();
};

const std::string& FunctionSymbol(const hipFunction_t f) { return f->_name; };

extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device);

#define CHECK_HSA(hsaStatus, hipStatus)                                                            \
    if (hsaStatus != HSA_STATUS_SUCCESS) {                                                         \
        return hipStatus;                                                                          \
    }

#define CHECKLOG_HSA(hsaStatus, hipStatus)                                                         \
    if (hsaStatus != HSA_STATUS_SUCCESS) {                                                         \
        return ihipLogStatus(hipStatus);                                                           \
    }

hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX,
                                  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                  uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                  hipStream_t hStream, void** kernelParams, void** extra,
                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
                                  void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) {
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
    hipError_t ret = hipSuccess;

    if (ctx == nullptr) {
        ret = hipErrorInvalidDevice;

    } else {
        int deviceId = ctx->getDevice()->_deviceId;
        ihipDevice_t* currentDevice = ihipGetDevice(deviceId);
        hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;

        std::vector<char> kernargs{};
        if (kernelParams) {
            if (extra) return hipErrorInvalidValue;

            for (auto&& x : f->_kernarg_layout) {
                const auto p{static_cast<const char*>(*kernelParams)};

                kernargs.insert(
                    kernargs.cend(),
                    round_up_to_next_multiple_nonnegative(
                        kernargs.size(), x.second) - kernargs.size(),
                    '\0');
                kernargs.insert(kernargs.cend(), p, p + x.first);

                ++kernelParams;
            }
        } else if (extra) {
            if (extra[0] == HIP_LAUNCH_PARAM_BUFFER_POINTER &&
                extra[2] == HIP_LAUNCH_PARAM_BUFFER_SIZE && extra[4] == HIP_LAUNCH_PARAM_END) {
                auto args = (char*)extra[1];
                size_t argSize = *(size_t*)(extra[3]);
                kernargs.insert(kernargs.end(), args, args+argSize);
            } else {
                return hipErrorNotInitialized;
            }

        }
        else if (f->_kernarg_layout.size() != 0) {
            return hipErrorInvalidValue;
        }

        // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero.
        size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1);
        kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0);

        if (impCoopParams) {
            const auto p{static_cast<const char*>(*impCoopParams)};
            // The sixth index is for multi-grid synchronization
            copy(p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT,
                    (kernargs.end() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT);
        }

        /*
          Kernel argument preparation.
        */
        grid_launch_parm lp;
        lp.dynamic_group_mem_bytes =
            sharedMemBytes;  // TODO - this should be part of preLaunchKernel.
        hStream = ihipPreLaunchKernel(
            hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
            dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked);

        hsa_kernel_dispatch_packet_t aql;

        memset(&aql, 0, sizeof(aql));

        // aql.completion_signal._handle = 0;
        // aql.kernarg_address = 0;

        aql.workgroup_size_x = localWorkSizeX;
        aql.workgroup_size_y = localWorkSizeY;
        aql.workgroup_size_z = localWorkSizeZ;
        aql.grid_size_x = globalWorkSizeX;
        aql.grid_size_y = globalWorkSizeY;
        aql.grid_size_z = globalWorkSizeZ;
        if (f->_is_code_object_v3) {
            const auto* header =
                reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
            aql.group_segment_size =
                header->group_segment_fixed_size + sharedMemBytes;
            aql.private_segment_size =
                header->private_segment_fixed_size;
        } else {
            aql.group_segment_size =
                f->_header->workgroup_group_segment_byte_size + sharedMemBytes;
            aql.private_segment_size =
                f->_header->workitem_private_segment_byte_size;
        }
        aql.kernel_object = f->_object;
        aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
        aql.header =
            (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE);
        if((flags & 0x1)== 0 ) {
            //in_order
            aql.header |= (1 << HSA_PACKET_HEADER_BARRIER);
        }

        aql.header |= lp.launch_fence;

        hc::completion_future cf;

        if (coopAV) {
            lp.av = coopAV;
        }

        lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(),
                                   (startEvent || stopEvent) ? &cf : nullptr
#if (__hcc_workweek__ > 17312)
                                   ,
                                   f->_name.c_str()
#endif
        );


        if (startEvent) {
            startEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStartCommand);
        }
        if (stopEvent) {
            stopEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStopCommand);
        }

        ihipPostLaunchKernel(f->_name.c_str(), hStream, lp, isStreamLocked);


    }

    return ret;
}

hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY,
                                 uint32_t gridDimZ, uint32_t blockDimX, uint32_t blockDimY,
                                 uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream,
                                 void** kernelParams, void** extra) {
    HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes,
                 hStream, kernelParams, extra);

    size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX;
    size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY;
    size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ;
    if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
    {
        return hipErrorInvalidConfiguration;
    }

    return ihipLogStatus(ihipModuleLaunchKernel(tls,
        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY,
        blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0));
}

hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                    uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                    uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                    hipStream_t hStream, void** kernelParams, void** extra,
                                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
    HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
                 localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);

    return ihipLogStatus(ihipModuleLaunchKernel(tls,
        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
        localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags));
}

hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                    uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                    uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                    hipStream_t hStream, void** kernelParams, void** extra,
                                    hipEvent_t startEvent, hipEvent_t stopEvent) {
    HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
                 localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);

    return ihipLogStatus(ihipModuleLaunchKernel(tls,
        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
        localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
}

__attribute__((visibility("default")))
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
                                              int  numDevices, unsigned int  flags, hip_impl::program_state& ps) {
    hipError_t result;

    if ((numDevices > g_deviceCnt) || (launchParamsList == nullptr)) {
        return hipErrorInvalidValue;
    }

    std::vector<hipFunction_t> kds(numDevices,0);

    // prepare all kernel descriptors for each device as all streams will be locked in the next loop
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];
        if (lp.stream == nullptr) {
            return hipErrorNotInitialized;
        }
        kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
                hip_impl::target_agent(lp.stream));

        if (kds[i] == nullptr) {
            return hipErrorInvalidValue;
        }
        if (!kds[i]->_kernarg_layout.empty()) continue;

        hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
                reinterpret_cast<std::uintptr_t>(lp.func));
        kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
                kargs.getHandle());
    }

    // lock all streams before launching kernels to each device
    for (int i = 0; i < numDevices; ++i) {
        LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
 #if (__hcc_workweek__ >= 19213)
        streamCrit->_av.acquire_locked_hsa_queue();
 #endif
     }

    GET_TLS();

    size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;

    // launch kernels for each  device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];

        globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
        globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
        globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;

        if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
        {
            return hipErrorInvalidConfiguration;
        }

        result = ihipModuleLaunchKernel(tls, kds[i],
                lp.gridDim.x * lp.blockDim.x,
                lp.gridDim.y * lp.blockDim.y,
                lp.gridDim.z * lp.blockDim.z,
                lp.blockDim.x, lp.blockDim.y,
                lp.blockDim.z, lp.sharedMem,
                lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
                true /* stream is already locked above and will be unlocked
                        in the below code after launching kernels on all devices*/);
    }

    // unlock all streams
    for (int i = 0; i < numDevices; ++i) {
        launchParamsList[i].stream->criticalData().unlock();
 #if (__hcc_workweek__ >= 19213)
        launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
 #endif
     }

    return result;
}

__attribute__((visibility("default")))
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
                                              int  numDevices, unsigned int  flags) {
    HIP_INIT_API(hipExtLaunchMultiKernelMultiDevice, launchParamsList, numDevices, flags);
    auto& ps = hip_impl::get_program_state();
    return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps);
}

void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
    if (f->_is_code_object_v3) {
        const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
        // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
        // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
        *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
        // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
        // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
        *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
        *usedLDS = header->group_segment_fixed_size;
    }
    else {
        const auto header = f->_header;
        // VGPRs granularity is 4
        *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
        // adding 2 to take into account the 2 VCC registers & handle the granularity of 16
        *usedSGPRS = header->wavefront_sgpr_count + 2;
        *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
        *usedLDS = header->workgroup_group_segment_byte_size;
    }
}

static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
   TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
{
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
    if (ctx == nullptr) {
        return hipErrorInvalidDevice;
    }
    if (numBlocks == nullptr) {
        return hipErrorInvalidValue;
    }

    hipDeviceProp_t prop{};
    ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);

    if (blockSize > prop.maxThreadsPerBlock) {
        *numBlocks = 0;
        return hipSuccess;
    }

    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

    size_t usedVGPRS = 0;
    size_t usedSGPRS = 0;
    size_t usedLDS = 0;
    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);

    // Due to SPI and private memory limitations, the max of wavefronts per CU in 32
    size_t wavefrontSize = prop.warpSize;
    size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);

    const size_t simdPerCU = 4;
    const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;

    size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;

    size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
    size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
        : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));

    // Calculate blocks occupancy per CU based on VGPR usage
    *numBlocks = vgprs_alu_occupancy / numWavefronts;

    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
    size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));

    // Calculate blocks occupancy per CU based on SGPR usage
    *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));

    size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
    if (total_used_lds != 0) {
      // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
      size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
      *numBlocks = std::min(*numBlocks, (int) lds_occupancy);
    }

    return hipSuccess;
}

namespace {
// kernel for initializing GWS
// nwm1 is the total number of work groups minus 1
__global__ void init_gws(uint nwm1) {
    __ockl_gws_init(nwm1, 0);
}
}

hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
        dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes,
        hipStream_t stream, hip_impl::program_state& ps) {

#if (__hcc_workweek__ >= 20115)
    hipError_t result;


    if (f == nullptr || kernelParams == nullptr) {
        return hipErrorNotInitialized;
    }

    stream = ihipSyncAndResolveStream(stream);

    if (!stream->getDevice()->_props.cooperativeLaunch ||
        blockDim.x * blockDim.y * blockDim.z > stream->getDevice()->_props.maxThreadsPerBlock) {
        return hipErrorInvalidConfiguration;
    }

    size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDim.x;
    size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDim.y;
    size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDim.z;
    if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) {
        return hipErrorInvalidConfiguration;
    }

    // Prepare the kernel descriptor for initializing the GWS
    hipFunction_t gwsKD = ps.kernel_descriptor(
            reinterpret_cast<std::uintptr_t>(&init_gws),
            hip_impl::target_agent(stream));

    if (gwsKD == nullptr) {
        return hipErrorInvalidValue;
    }
    hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
                    reinterpret_cast<std::uintptr_t>(&init_gws));

    gwsKD->_kernarg_layout = *reinterpret_cast<const std::vector<
            std::pair<std::size_t, std::size_t>>*>(gwsKargs.getHandle());

    // Prepare the kernel descriptor for the main kernel
    hipFunction_t kd = ps.kernel_descriptor(
            reinterpret_cast<std::uintptr_t>(f),
            hip_impl::target_agent(stream));
    if (kd == nullptr) {
        return hipErrorInvalidValue;
    }
    hip_impl::kernargs_size_align kargs =
            ps.get_kernargs_size_align(
                    reinterpret_cast<std::uintptr_t>(f));

    kd->_kernarg_layout = *reinterpret_cast<const std::vector<
            std::pair<std::size_t, std::size_t>>*>(kargs.getHandle());

    GET_TLS();
    int numBlocksPerSm = 0;
    result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd,
                    blockDim.x * blockDim.y * blockDim.z, sharedMemBytes);
    if (result != hipSuccess) {
        return hipErrorLaunchFailure;
    }
    int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount;

    //check to see if the workload fits on the GPU
    if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks) {
        return hipErrorCooperativeLaunchTooLarge;
    }

    void *gwsKernelParam[1];
    // calculate total number of work groups minus 1 for the main kernel
    uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1;
    gwsKernelParam[0] = &nwm1;

    hc::accelerator acc = stream->getDevice()->_acc;
    // create a cooperative accelerated view for launching gws and main kernels
    hc::accelerator_view coopAV = acc.create_cooperative_view();

    LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);

    // the cooperative queue will wait until this stream completes its operations
    hc::completion_future streamCF;
    if (!streamCrit->_av.get_is_empty()) {
        streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
        coopAV.create_blocking_marker(streamCF, hc::accelerator_scope);
    }

    streamCrit->_av.acquire_locked_hsa_queue();
    coopAV.acquire_locked_hsa_queue();

    // launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue
    result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
             0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV);

    if (result != hipSuccess) {
        stream->criticalData().unlock();
        stream->criticalData()._av.release_locked_hsa_queue();
        coopAV.release_locked_hsa_queue();
        return hipErrorLaunchFailure;
    }

    size_t impCoopArg = 1;
    void* impCoopParams[1];
    impCoopParams[0] = &impCoopArg;

    // launch the main kernel in the cooperative queue
    result = ihipModuleLaunchKernel(tls, kd,
            gridDim.x * blockDim.x,
            gridDim.y * blockDim.y,
            gridDim.z * blockDim.z,
            blockDim.x, blockDim.y, blockDim.z,
            sharedMemBytes, stream, kernelParams, nullptr, nullptr,
            nullptr, 0, true, impCoopParams, &coopAV);


    coopAV.release_locked_hsa_queue();
    stream->criticalData()._av.release_locked_hsa_queue();

    // this stream will wait until the cooperative queue completes its operations
    hc::completion_future cooperativeCF;
    if (!coopAV.get_is_empty()) {
        cooperativeCF = coopAV.create_marker(hc::accelerator_scope);
        streamCrit->_av.create_blocking_marker(cooperativeCF, hc::accelerator_scope);
    }

    stream->criticalData().unlock();

    return result;
#else
    return hipErrorInvalidConfiguration;
#endif

}

__attribute__((visibility("default")))
hipError_t hipLaunchCooperativeKernel(const void* func, dim3 gridDim,
                                    dim3 blockDim, void** args,
                                    size_t sharedMem, hipStream_t stream,
                                    hip_impl::program_state& ps) {

    // Skipping passing in ps, because the logging function does not like it
    HIP_INIT_API(hipLaunchCooperativeKernel, func, gridDim, blockDim, args,
                 sharedMem, stream);

    return ihipLogStatus(ihipLaunchCooperativeKernel(func, gridDim, blockDim,
                         args, sharedMem, stream, ps));
}


hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
        int  numDevices, unsigned int  flags, hip_impl::program_state& ps) {

#if (__hcc_workweek__ >= 20115)
    hipError_t result;

    if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
        return hipErrorInvalidValue;
    }

    vector<hipStream_t> streams;
    vector<uint64_t> deviceIDs;
    // check to see if we have valid distinct streams/devices, if cooperative multi device
    // launch is supported and if grid/block dimensions are valid
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];

        if (lp.stream == nullptr){
            return hipErrorInvalidResourceHandle;
        }

        if (find(streams.begin(), streams.end(), lp.stream) == streams.end()) {
            streams.push_back(lp.stream);
        } else {
            return hipErrorInvalidDevice;
        }

        const ihipDevice_t* currentDevice = lp.stream->getDevice();
        if (find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId) == deviceIDs.end()) {
            deviceIDs.push_back(currentDevice->_deviceId);
        } else {
            return hipErrorInvalidDevice;
        }

        if (!currentDevice->_props.cooperativeMultiDeviceLaunch) {
            return hipErrorInvalidConfiguration;
        }

        if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 ||
            lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0 ||
            lp.blockDim.x * lp.blockDim.y * lp.blockDim.z > currentDevice->_props.maxThreadsPerBlock){
            return hipErrorInvalidConfiguration;
        }
    }

    vector<hipFunction_t> gwsKds;
    vector<hipFunction_t> kds;

    GET_TLS();
    // prepare all kernel descriptors for initializing the GWS and the main kernels per device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];

        gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
                hip_impl::target_agent(lp.stream)));
        if (gwsKds[i] == nullptr) {
            return hipErrorInvalidValue;
        }
        hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
                reinterpret_cast<std::uintptr_t>(&init_gws));
        gwsKds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
                gwsKargs.getHandle());


        kds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
                hip_impl::target_agent(lp.stream)));
        if (kds[i] == nullptr) {
            return hipErrorInvalidValue;
        }
        hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
                reinterpret_cast<std::uintptr_t>(lp.func));
        kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
                kargs.getHandle());

        int numBlocksPerSm = 0;
        result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i],
                        lp.blockDim.x * lp.blockDim.y * lp.blockDim.z, lp.sharedMem);
        if (result != hipSuccess) {
            return hipErrorLaunchFailure;
        }
        int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount;

        //check to see if the workload fits on the GPU
        if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks) {
            return hipErrorCooperativeLaunchTooLarge;
        }
    }

    vector<hc::accelerator_view> coopAVs;

    // create cooperative accelerated views for launching gws and main kernels on each device
    for (int i = 0; i < numDevices; ++i) {
        hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc;
        coopAVs.push_back(acc.create_cooperative_view());
    }

    mg_sync *mg_sync_ptr = 0;
    vector<mg_info *> mg_info_ptr;

    result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault, true);
    if (result != hipSuccess) {
        return hipErrorInvalidValue;
    }
    mg_sync_ptr->w0 = 0;
    mg_sync_ptr->w1 = 0;

    uint all_sum = 0;
    for (int i = 0; i < numDevices; ++i) {
        mg_info *mg_info_temp = nullptr;
        result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault, true);
        if (result != hipSuccess) {
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < i; ++j) {
                hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
            }
            return hipErrorInvalidValue;
        }
        mg_info_ptr.push_back(mg_info_temp);
        // calculate the sum of sizes of all grids
        const hipLaunchParams& lp = launchParamsList[i];
        all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
                   lp.gridDim.x  * lp.gridDim.y  * lp.gridDim.z;
    }

    // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
    for (int i = 0; i < numDevices; ++i) {
        LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);

        hc::completion_future streamCF;
        if (!streamCrit->_av.get_is_empty()) {
            streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
            if (flags & hipCooperativeLaunchMultiDeviceNoPreSync) {
                coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope);
                streamCrit->_av.acquire_locked_hsa_queue();
                coopAVs[i].acquire_locked_hsa_queue();
            } else {
                for (int j = 0; j < numDevices; ++j) {
                    coopAVs[j].create_blocking_marker(streamCF, hc::accelerator_scope);
                }
            }
        }
    }
    if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) {
        for (int i = 0; i < numDevices; ++i) {
            launchParamsList[i].stream->criticalData()._av.acquire_locked_hsa_queue();
            coopAVs[i].acquire_locked_hsa_queue();
        }
    }

    // launch the init_gws kernel to initialize the GWS for each device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];

        void *gwsKernelParam[1];
        uint nwm1 = (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z) - 1;
        gwsKernelParam[0] = &nwm1;

        result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1,
                0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]);

        if (result != hipSuccess) {
            for (int j = 0; j < numDevices; ++j) {
                launchParamsList[j].stream->criticalData().unlock();
                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
                coopAVs[i].release_locked_hsa_queue();
            }
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < numDevices; ++j) {
                hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
            }

            return hipErrorLaunchFailure;
        }
    }

    void* impCoopParams[1];
    ulong prev_sum = 0;

    size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
    // launch the main kernels for each device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];

        //initialize and setup the implicit kernel argument for multi-grid sync
        mg_info_ptr[i]->mgs       = mg_sync_ptr;
        mg_info_ptr[i]->grid_id   = i;
        mg_info_ptr[i]->num_grids = numDevices;
        mg_info_ptr[i]->all_sum   = all_sum;
        mg_info_ptr[i]->prev_sum  = prev_sum;
        prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
                    lp.gridDim.x  * lp.gridDim.y  * lp.gridDim.z;

        lp.stream->coopMemsTracker.push_back(mg_info_ptr[i]);

        impCoopParams[0] = &mg_info_ptr[i];

        globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
        globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
        globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
        if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) {
            return hipErrorInvalidConfiguration;
        }

        result = ihipModuleLaunchKernel(tls, kds[i],
                lp.gridDim.x * lp.blockDim.x,
                lp.gridDim.y * lp.blockDim.y,
                lp.gridDim.z * lp.blockDim.z,
                lp.blockDim.x, lp.blockDim.y,
                lp.blockDim.z, lp.sharedMem,
                lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
                true, impCoopParams, &coopAVs[i]);

        if (result != hipSuccess) {
            for (int j = 0; j < numDevices; ++j) {
                launchParamsList[j].stream->criticalData().unlock();
                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
                coopAVs[i].release_locked_hsa_queue();
            }
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < numDevices; ++j) {
                hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
                launchParamsList[j].stream->coopMemsTracker.pop_back();
            }

            return hipErrorLaunchFailure;
        }

    }

    // unlock streams and create blocking markers on them based on the workload
    // on cooperative queues on each device
    for (int i = 0; i < numDevices; ++i) {
        coopAVs[i].release_locked_hsa_queue();
        launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
    }

    for (int i = 0; i < numDevices; ++i) {
        hc::completion_future cooperativeCF;
        if (!coopAVs[i].get_is_empty()) {
            cooperativeCF = coopAVs[i].create_marker(hc::accelerator_scope);
            if (flags & hipCooperativeLaunchMultiDeviceNoPostSync) {
                launchParamsList[i].stream->criticalData()._av.create_blocking_marker(
                        cooperativeCF, hc::accelerator_scope);
                launchParamsList[i].stream->criticalData().unlock();
            } else {
                for (int j = 0; j < numDevices; ++j) {
                    launchParamsList[j].stream->criticalData()._av.create_blocking_marker(
                            cooperativeCF, hc::accelerator_scope);
                }
            }
        }
    }

    if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) {
        for (int i = 0; i < numDevices; ++i) {
            launchParamsList[i].stream->criticalData().unlock();
        }
    }

    return result;
#else
    return hipErrorInvalidConfiguration;
#endif
}

__attribute__((visibility("default")))
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
                                                 int  numDevices,
                                                 unsigned int flags,
                                                 hip_impl::program_state& ps) {

    // Skipping passing in ps, because the logging function does not like it
    HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList,
                 numDevices, flags);

    return ihipLogStatus(ihipLaunchCooperativeKernelMultiDevice(launchParamsList,
                                                                numDevices,
                                                                flags, ps));
}

namespace hip_impl {
    hsa_executable_t executable_for(hipModule_t hmod) {
        return hmod->executable;
    }

    const char* hash_for(hipModule_t hmod) {
        return hmod->hash.c_str();
    }

    hsa_agent_t this_agent() {
        GET_TLS();
        auto ctx = ihipGetTlsDefaultCtx();

        if (!ctx) throw runtime_error{"No active HIP context."};

        auto device = ctx->getDevice();

        if (!device) throw runtime_error{"No device available for HIP."};

        ihipDevice_t* currentDevice = ihipGetDevice(device->_deviceId);

        if (!currentDevice) throw runtime_error{"No active device for HIP."};

        return currentDevice->_hsaAgent;
    }

    struct Agent_global {
        Agent_global() : name(nullptr), address(nullptr), byte_cnt(0) {}
        Agent_global(const char* name, hipDeviceptr_t address, uint32_t byte_cnt)
            : name(nullptr), address(address), byte_cnt(byte_cnt) {
                if (name)
                    this->name = strdup(name);
        }

        Agent_global& operator=(Agent_global&& t) {
            if (this == &t) return *this;

            if (name) free(name);
            name = t.name;
            address = t.address;
            byte_cnt = t.byte_cnt;

            t.name = nullptr;
            t.address = nullptr;
            t.byte_cnt = 0;

            return *this;
        }

        Agent_global(Agent_global&& t)
            : name(nullptr), address(nullptr), byte_cnt(0) {
                *this = std::move(t);
        }

        // not needed, delete them to prevent bugs
        Agent_global(const Agent_global&) = delete;
        Agent_global& operator=(Agent_global& t) = delete;

        ~Agent_global() { if (name) free(name); }

        char* name;
        hipDeviceptr_t address;
        uint32_t byte_cnt;
    };

    template<typename ForwardIterator>
    std::pair<hipDeviceptr_t, std::size_t> read_global_description(
        ForwardIterator f, ForwardIterator l, const char* name) {
        const auto it = std::find_if(f, l, [=](const Agent_global& x) {
            return strcmp(x.name, name) == 0;
        });

        return it == l ?
            std::make_pair(nullptr, 0u) : std::make_pair(it->address, it->byte_cnt);
    }

    std::vector<Agent_global> read_agent_globals(hsa_agent_t agent,
                                            hsa_executable_t executable);
    class agent_globals_impl {
        private:
            std::pair<
                std::mutex,
                std::unordered_map<
                    std::string, std::vector<Agent_global>>> globals_from_module;

            std::unordered_map<
                hsa_agent_t,
                std::pair<
                    std::once_flag,
                    std::vector<Agent_global>>> globals_from_process;

        public:

            hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
                    hipModule_t hmod, const char* name) {
                // the key of the map would the hash of code object associated with the
                // hipModule_t instance
                std::string key(hash_for(hmod));

                if (globals_from_module.second.count(key) == 0) {
                    std::lock_guard<std::mutex> lck{globals_from_module.first};

                    if (globals_from_module.second.count(key) == 0) {
                        globals_from_module.second.emplace(
                                key, read_agent_globals(this_agent(), executable_for(hmod)));
                    }
                }

                const auto it0 = globals_from_module.second.find(key);
                if (it0 == globals_from_module.second.cend()) {
                    hip_throw(
                            std::runtime_error{"agent_globals data structure corrupted."});
                }

                std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(),
                        it0->second.cend(), name);
                // HACK for SWDEV-173477
                //
                // For code objects with global symbols of length 0, ROCR runtime's fix
                // may not be working correctly. Therefore the
                // result from read_agent_globals() can't be trusted entirely.
                //
                // As a workaround to tame applications which depend on the existence of
                // global symbols with length 0, always return hipSuccess here.
                //
                // This behavior shall be reverted once ROCR runtime has been fixed to
                // address SWDEV-173477 and SWDEV-190701

                //return *dptr ? hipSuccess : hipErrorNotFound;
                return hipSuccess;
            }

            hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
                    const char* name) {

                auto agent = this_agent();

                std::call_once(globals_from_process[agent].first, [this](hsa_agent_t aa) {
                    std::vector<Agent_global> tmp0;
                    for (auto&& executable : hip_impl::get_program_state().impl->get_executables(aa)) {
                        auto tmp1 = read_agent_globals(aa, executable);
                        tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()),
                            make_move_iterator(tmp1.end()));
                    }
                    globals_from_process[aa].second = move(move(tmp0));
                }, agent);

                const auto it = globals_from_process.find(agent);

                if (it == globals_from_process.cend()) return hipErrorNotInitialized;

                std::tie(*dptr, *bytes) = read_global_description(it->second.second.cbegin(),
                        it->second.second.cend(), name);

                return *dptr ? hipSuccess : hipErrorNotFound;
            }

    };

    agent_globals::agent_globals() : impl(new agent_globals_impl()) {
        if (!impl)
            hip_throw(
                std::runtime_error{"Error when constructing agent global data structures."});
    }
    agent_globals::~agent_globals() { delete impl; }

    hipError_t agent_globals::read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
                                             hipModule_t hmod, const char* name) {
        return impl->read_agent_global_from_module(dptr, bytes, hmod, name);
    }

    hipError_t agent_globals::read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
                                              const char* name) {
        hipError_t result = impl->read_agent_global_from_process(dptr, bytes, name);
        if(result != hipSuccess) {
           // For Clang Compiler + Hcc Rt
           result = ihipGetGlobalVar(dptr, bytes, name);
        }
        return result;
    }

} // Namespace hip_impl.

hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
                              hipModule_t hmod, const char* name) {
    HIP_INIT_API(hipModuleGetGlobal, dptr, bytes, hmod, name);
    if (!dptr || !bytes || !hmod) return hipErrorInvalidValue;

    if (!name) return hipErrorNotInitialized;

    return hip_impl::get_agent_globals().read_agent_global_from_module(dptr, bytes, hmod, name);
}

namespace {
inline void track(const hip_impl::Agent_global& x, hsa_agent_t agent) {
    GET_TLS();
    tprintf(DB_MEM, "  add variable '%s' with ptr=%p size=%u to tracker\n", x.name,
            x.address, x.byte_cnt);

    int deviceIndex =0;
    for ( deviceIndex = 0; deviceIndex < g_deviceCnt; deviceIndex++) {
        if(g_allAgents[deviceIndex] == agent)
           break;
    }
    auto device = ihipGetDevice(deviceIndex - 1);
    hc::AmPointerInfo ptr_info(nullptr, x.address, x.address, x.byte_cnt, device->_acc, true,
                               false);
    hc::am_memtracker_add(x.address, ptr_info);
#if USE_APP_PTR_FOR_CTX
    hc::am_memtracker_update(x.address, device->_deviceId, 0u, ihipGetTlsDefaultCtx());
#else
    hc::am_memtracker_update(x.address, device->_deviceId, 0u);
#endif

}

template <typename Container = vector<hip_impl::Agent_global>>
inline hsa_status_t copy_agent_global_variables(hsa_executable_t, hsa_agent_t agent,
                                                hsa_executable_symbol_t x, void* out) {
    using namespace hip_impl;

    assert(out);

    hsa_symbol_kind_t t = {};
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &t);

    if (t == HSA_SYMBOL_KIND_VARIABLE) {
        hip_impl::Agent_global tmp(name(x).c_str(), address(x), size(x));
        static_cast<Container*>(out)->push_back(std::move(tmp));

        track(static_cast<Container*>(out)->back(),agent);
    }

    return HSA_STATUS_SUCCESS;
}

inline hsa_status_t remove_agent_global_variables(hsa_executable_t, hsa_agent_t agent,
                                                  hsa_executable_symbol_t x, void* unused) {
    hsa_symbol_kind_t t = {};
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &t);

    if (t == HSA_SYMBOL_KIND_VARIABLE) {
        hc::am_memtracker_remove(hip_impl::address(x));
    }

    return HSA_STATUS_SUCCESS;
}

hsa_executable_symbol_t find_kernel_by_name(hsa_executable_t executable, const char* kname,
                                            hsa_agent_t* agent = nullptr) {
    using namespace hip_impl;
    hsa_executable_symbol_t symbol = { 0 };
    hsa_agent_t thisagent = agent ? *agent : this_agent();
    hsa_status_t err = hsa_executable_get_symbol_by_name(executable, kname, &thisagent ,&symbol);
    //TODO check err ?
    return symbol;
}


string read_elf_file_as_string(const void* file) {
    // Precondition: file points to an ELF image that was BITWISE loaded
    //               into process accessible memory, and not one loaded by
    //               the loader. This is because in the latter case
    //               alignment may differ, which will break the size
    //               computation.
    //               the image is Elf64, and matches endianness i.e. it is
    //               Little Endian.
    if (!file) return {};

    auto h = static_cast<const ELFIO::Elf64_Ehdr*>(file);
    auto s = static_cast<const char*>(file);
    // This assumes the common case of SHT being the last part of the ELF.
    auto sz = h->e_shoff + h->e_shentsize * h->e_shnum;

    return string{s, s + sz};
}

string code_object_blob_for_agent(const void* maybe_bundled_code, hsa_agent_t agent) {
    using namespace hip_impl;

    if (!maybe_bundled_code) return {};

    Bundled_code_header tmp{maybe_bundled_code};

    if (!valid(tmp)) return {};

    const auto agent_isa = isa(agent);

    const auto it = find_if(bundles(tmp).cbegin(), bundles(tmp).cend(), [=](const Bundled_code& x) {
        return agent_isa == triple_to_hsa_isa(x.triple);
        ;
    });

    if (it == bundles(tmp).cend()) return {};

    return string{it->blob.cbegin(), it->blob.cend()};
}
} // Unnamed namespace.

namespace hip_impl {
    vector<Agent_global> read_agent_globals(hsa_agent_t agent,
                                            hsa_executable_t executable) {
        vector<Agent_global> r;

        hsa_executable_iterate_agent_symbols(
            executable, agent, copy_agent_global_variables, &r);

        return r;
    }
    void remove_agent_globals_from_tracker(hsa_agent_t agent, hsa_executable_t executable) {
        hsa_executable_iterate_agent_symbols(executable, agent, remove_agent_global_variables, NULL);
    }
} // Namespace hip_impl.

hipError_t hipModuleUnload(hipModule_t hmod) {
    HIP_INIT_API(hipModuleUnload, hmod);

    // TODO - improve this synchronization so it is thread-safe.
    // Currently we want for all inflight activity to complete, but don't prevent another
    // thread from launching new kernels before we finish this operation.
    ihipSynchronize(tls);

    // deleting ihipModule_t does not remove agent globals from hc_am memtracker
    hip_impl::remove_agent_globals_from_tracker(hip_impl::this_agent(), hip_impl::executable_for(hmod));

    delete hmod;  // The ihipModule_t dtor will clean everything up.
    hmod = nullptr;

    return ihipLogStatus(hipSuccess);
}

hipError_t ihipModuleGetFunction(TlsData *tls, hipFunction_t* func, hipModule_t hmod, const char* name,
                                 hsa_agent_t *agent = nullptr) {
    using namespace hip_impl;

    if (!func || !name) return hipErrorInvalidValue;

    auto ctx = ihipGetTlsDefaultCtx();

    if (!ctx) return hipErrorInvalidContext;

    *func = new ihipModuleSymbol_t;

    if (!*func) return hipErrorInvalidValue;

    std::string name_str(name);
    std::string namekd_str(name_str + ".kd");
    bool kernel_by_namekd = false;

    auto kernel = find_kernel_by_name(hmod->executable, name_str.c_str(), agent);

    if (kernel.handle == 0u) {
        kernel_by_namekd = true; //Find kernel by namekd_str
        kernel = find_kernel_by_name(hmod->executable, namekd_str.c_str(), agent);
    }

    if (kernel.handle == 0u) return hipErrorNotFound;

    //For hipModuleLoad(), hmod->kernargs must contain an args with key
    //name_str or namekd_str.
    //For hipLaunchKernelGGL(), hmod->kernargs is empty, thus we need
    //insert hmod->kernargs[name_str]
    auto it = hmod->kernargs.find(name_str); //Look up args from the original name
    if (it == hmod->kernargs.end()) {
        it = hmod->kernargs.find(namekd_str); //Look up args from .kd name
    }

    // TODO: refactor the whole ihipThisThat, which is a mess and yields the
    //       below, due to hipFunction_t being a pointer to ihipModuleSymbol_t.

    func[0][0] = *static_cast<hipFunction_t>(
        Kernel_descriptor{kernel_object(kernel),
        kernel_by_namekd ? namekd_str : name_str,
        it != hmod->kernargs.end() ? it->second : hmod->kernargs[name_str]});

    return hipSuccess;
}

// Get kernel for the current hsa agent.
hipError_t hipModuleGetFunction(hipFunction_t* hfunc, hipModule_t hmod, const char* name) {
    HIP_INIT_API(hipModuleGetFunction, hfunc, hmod, name);
    return ihipLogStatus(ihipModuleGetFunction(tls, hfunc, hmod, name));
}

// Get kernel for the given hsa agent. Internal use only.
hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
                                  const char* name, hsa_agent_t *agent) {
    HIP_INIT_API(hipModuleGetFunctionEx, hfunc, hmod, name, agent);
    return ihipLogStatus(ihipModuleGetFunction(tls, hfunc, hmod, name, agent));
}

namespace {
const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
  return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
}

hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) {
    hipFuncAttributes r{};

    hipDeviceProp_t prop{};
    hipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
    // TODO: at the moment there is no way to query the count of registers
    //       available per CU, therefore we hardcode it to 64 KiRegisters.
    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

    if (kd._is_code_object_v3) {
        r.binaryVersion = 0; // FIXME: should it be the ISA version or code
                             //        object format version?
        r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
        r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
    } else {
        r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
        r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
        r.binaryVersion =
            kd._header->amd_machine_version_major * 10 +
            kd._header->amd_machine_version_minor;
    }
    r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;

    size_t usedVGPRS = 0;
    size_t usedSGPRS = 0;
    size_t usedLDS = 0;
    getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS);

    r.numRegs = usedVGPRS;

    size_t wavefrontSize = prop.warpSize;
    size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;
    size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
    const size_t numSIMD = 4;
    const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD;
    size_t maxWaves = 0;
    for (int i = 0; i < maxWavefrontsPerBlock; i++) {
        size_t wavefronts = i + 1;

        if (usedVGPRS > 0) {
            size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD);
            size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);

            // Calculate blocks occupancy per CU based on VGPR usage
            if (vgprs_alu_occupancy < wavefronts)
                break;
        }

        if (usedSGPRS > 0) {
            const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
            size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd
                : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));

            // Calculate blocks occupancy per CU based on SGPR usage
            if (sgprs_alu_occupancy < wavefronts)
                break;
        }
        maxWaves = wavefronts;
    }

    r.maxThreadsPerBlock = maxWaves * wavefrontSize;
    r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.

    return r;
}
} // Unnamed namespace.

hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
{
    HIP_INIT_API(hipFuncGetAttributes, attr, func);
    using namespace hip_impl;

    if (!attr) return ihipLogStatus(hipErrorInvalidValue);
    if (!func) return ihipLogStatus(hipErrorInvalidDeviceFunction);

    auto agent = this_agent();
    auto kd = get_program_state().kernel_descriptor(reinterpret_cast<uintptr_t>(func), agent);

    if (!kd->_header) throw runtime_error{"Ill-formed Kernel_descriptor."};

    *attr = make_function_attributes(tls, *kd);

    return ihipLogStatus(hipSuccess);
}

hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc)
{
    HIP_INIT_API(hipFuncGetAttribute, value, attrib, hfunc);
    using namespace hip_impl;

    hipError_t retVal = hipSuccess;
    if (!value) return ihipLogStatus(hipErrorInvalidValue);
    hipFuncAttributes attr{};
    attr = make_function_attributes(tls, *hfunc);
    switch(attrib) {
        case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES:
            *value = (int) attr.sharedSizeBytes;
            break;
        case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK:
            *value = attr.maxThreadsPerBlock;
            break;
        case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES:
            *value = (int) attr.constSizeBytes;
            break;
        case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES:
            *value = (int) attr.localSizeBytes;
            break;
        case HIP_FUNC_ATTRIBUTE_NUM_REGS:
            *value = attr.numRegs;
            break;
        case HIP_FUNC_ATTRIBUTE_PTX_VERSION:
            *value = attr.ptxVersion;
            break;
        case HIP_FUNC_ATTRIBUTE_BINARY_VERSION:
            *value = attr.binaryVersion;
            break;
        case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA:
            *value = attr.cacheModeCA;
            break;
        case HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
            *value = attr.maxDynamicSharedSizeBytes;
            break;
        case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
            *value = attr.preferredShmemCarveout;
            break;
        default:
             retVal = hipErrorInvalidValue;
    }
    return ihipLogStatus(retVal);
}

hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* image) {
    using namespace hip_impl;

    if (!module) return hipErrorInvalidValue;

    *module = new ihipModule_t;

    auto ctx = ihipGetTlsDefaultCtx();
    if (!ctx) return hipErrorInvalidContext;

    // try extracting code object from image as fatbin.
    char name[64] = {};
    hsa_agent_get_info(this_agent(), HSA_AGENT_INFO_NAME, name);
    if (auto *code_obj = __hipExtractCodeObjectFromFatBinary(image, name))
      image = code_obj;

    hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr,
                              &(*module)->executable);

    auto tmp = code_object_blob_for_agent(image, this_agent());

    auto content = tmp.empty() ? read_elf_file_as_string(image) : tmp;

    (*module)->executable = get_program_state().load_executable(
                                            content.data(), content.size(), (*module)->executable,
                                            this_agent());

    program_state_impl::read_kernarg_metadata(content, (*module)->kernargs);

    // compute the hash of the code object
    (*module)->hash = checksum(content.length(), content.data());

    return (*module)->executable.handle ? hipSuccess : hipErrorUnknown;
}

hipError_t hipModuleLoadData(hipModule_t* module, const void* image) {
    HIP_INIT_API(hipModuleLoadData, module, image);
    return ihipLogStatus(ihipModuleLoadData(tls,module,image));
}

hipError_t hipModuleLoad(hipModule_t* module, const char* fname) {
    HIP_INIT_API(hipModuleLoad, module, fname);

    if (!fname) return ihipLogStatus(hipErrorInvalidValue);

    ifstream file{fname};

    if (!file.is_open()) return ihipLogStatus(hipErrorFileNotFound);

    vector<char> tmp{istreambuf_iterator<char>{file}, istreambuf_iterator<char>{}};

    return ihipLogStatus(ihipModuleLoadData(tls, module, tmp.data()));
}

hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions,
                               hipJitOption* options, void** optionValues) {
    HIP_INIT_API(hipModuleLoadDataEx, module, image, numOptions, options, optionValues);
    return ihipLogStatus(ihipModuleLoadData(tls, module, image));
}

hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name) {
    using namespace hip_impl;

    HIP_INIT_API(hipModuleGetTexRef, texRef, hmod, name);

    hipError_t ret = hipErrorNotFound;
    if (!texRef) return ihipLogStatus(hipErrorInvalidValue);

    if (!hmod || !name) return ihipLogStatus(hipErrorNotInitialized);

    auto addr = get_program_state().global_addr_by_name(name);
    if (addr == nullptr) return ihipLogStatus(hipErrorInvalidValue);

    *texRef = reinterpret_cast<textureReference*>(addr);
    return ihipLogStatus(hipSuccess);
}

hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize,
                                              hipFunction_t f, size_t dynSharedMemPerBlk,
                                              int blockSizeLimit)
{
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
    if (ctx == nullptr) {
        return hipErrorInvalidDevice;
    }

    hipDeviceProp_t prop{};
    ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);

    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

    size_t usedVGPRS = 0;
    size_t usedSGPRS = 0;
    size_t usedLDS = 0;
    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);

    // try different workgroup sizes to find the maximum potential occupancy
    // based on the usage of VGPRs and LDS
    size_t wavefrontSize = prop.warpSize;
    size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;

    // Due to SPI and private memory limitations, the max of wavefronts per CU in 32
    size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);

    const size_t numSIMD = 4;
    size_t maxActivWaves = 0;
    size_t maxWavefronts = 0;
    for (int i = 0; i < maxWavefrontsPerBlock; i++) {
        size_t wavefrontsPerWG = i + 1;

        // workgroup per CU is 40 for WG size of 1 wavefront; otherwise it is 16
        size_t maxWorkgroupPerCU = (wavefrontsPerWG == 1) ? 40 : 16;
        size_t maxWavesWGLimited = min(wavefrontsPerWG * maxWorkgroupPerCU, maxWavefrontsPerCU);

        // Compute VGPR limited wavefronts per block
        size_t wavefrontsVGPRS;
        if (usedVGPRS == 0) {
            wavefrontsVGPRS = maxWavesWGLimited;
        }
        else {
            // find how many VGPRs are available for each SIMD
            size_t numVGPRsPerSIMD = (prop.regsPerBlock / wavefrontSize / numSIMD);
            wavefrontsVGPRS = (numVGPRsPerSIMD / usedVGPRS) * numSIMD;
        }

        size_t maxWavesVGPRSLimited = 0;
        if (wavefrontsVGPRS > maxWavesWGLimited) {
            maxWavesVGPRSLimited = maxWavesWGLimited;
        }
        else {
            maxWavesVGPRSLimited = (wavefrontsVGPRS / wavefrontsPerWG) * wavefrontsPerWG;
        }

        // Compute SGPR limited wavefronts per block
        size_t wavefrontsSGPRS;
        if (usedSGPRS == 0) {
            wavefrontsSGPRS = maxWavesWGLimited;
        }
        else {
            const size_t numSGPRsPerSIMD = (prop.gcnArch < 800) ? 512 : 800;
            wavefrontsSGPRS = (numSGPRsPerSIMD / usedSGPRS) * numSIMD;
        }

        size_t maxWavesSGPRSLimited = 0;
        if (wavefrontsSGPRS > maxWavesWGLimited) {
            maxWavesSGPRSLimited = maxWavesWGLimited;
        }
        else {
            maxWavesSGPRSLimited = (wavefrontsSGPRS / wavefrontsPerWG) * wavefrontsPerWG;
        }

        // Compute LDS limited wavefronts per block
        size_t wavefrontsLDS;
        if (usedLDS == 0) {
            wavefrontsLDS = maxWorkgroupPerCU * wavefrontsPerWG;
        }
        else {
            size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor;
            size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk);
            wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG;
        }

        size_t maxWavesLDSLimited = min(wavefrontsLDS, maxWavefrontsPerCU);

        size_t activeWavefronts = 0;
        size_t tmp_min = (size_t)min(maxWavesLDSLimited, maxWavesWGLimited);
        tmp_min = min(maxWavesSGPRSLimited, tmp_min);
        activeWavefronts = min(maxWavesVGPRSLimited, tmp_min);

        if (maxActivWaves <= activeWavefronts) {
            maxActivWaves = activeWavefronts;
            maxWavefronts = wavefrontsPerWG;
        }
    }

    // determine the grid and block sizes for maximum potential occupancy
    size_t maxThreadsCnt = prop.maxThreadsPerMultiProcessor*prop.multiProcessorCount;
    if (blockSizeLimit > 0) {
       maxThreadsCnt = min(maxThreadsCnt, blockSizeLimit);
    }

    *blockSize = maxWavefronts * wavefrontSize;
    *gridSize = min((maxThreadsCnt + *blockSize - 1) / *blockSize, prop.multiProcessorCount);

    return hipSuccess;
}

hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
                                             hipFunction_t f, size_t dynSharedMemPerBlk,
                                             int blockSizeLimit)
{
    HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit);

    return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls,
       gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
}

hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
                                             hipFunction_t f, size_t dynSharedMemPerBlk,
                                             int blockSizeLimit, unsigned int  flags)
{
    HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSizeWithFlags, gridSize, blockSize, f, dynSharedMemPerBlk,
                 blockSizeLimit, flags);
    if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported);
    return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls,
        gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
}

hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk)
{
    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
                                               hip_impl::target_agent(0));
    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
        tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
}

hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
   int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
{
    HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);

    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
}

hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
   int* numBlocks, const void* f, int  blockSize, size_t dynSharedMemPerBlk,
   unsigned int flags)
{
    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
    if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported);
    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
                                               hip_impl::target_agent(0));
    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
        tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
}

hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
   int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk,
   unsigned int flags)
{
    HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
    if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported);
    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
}

hipError_t hipLaunchKernel(
    const void* func_addr, dim3 numBlocks, dim3 dimBlocks, void** args,
    size_t sharedMemBytes, hipStream_t stream)
{
   HIP_INIT_API(hipLaunchKernel,func_addr,numBlocks,dimBlocks,args,sharedMemBytes,stream);

   hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)func_addr,
                                                           hip_impl::target_agent(stream));

   return hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
                          dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
                          stream, args, nullptr);
}