diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index cd785fe336..060e70e519 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -117,6 +117,10 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl
     - Keyword TODO refers to a note that should be addressed in long-term.  Could be style issue, software architecture, or known bugs.
     - FIXME refers to a short-term bug that needs to be addressed.
 
+- HIP_INIT_API() should be placed at the start of each top-level HIP API.  This function will make sure the HIP runtime is initialized,
+  and also constructs an appropriate API string for tracing and CodeXL marker tracing.  The arguments to HIP_INIT_API should match
+  those of the parent fucntion.  
+
 
 #### Presubmit Testing:
 Before checking in or submitting a pull request, run all Rodinia tests and ensure pass results match starting point:
diff --git a/bin/hipcc b/bin/hipcc
index c51e351382..8fde7f426d 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -105,7 +105,7 @@ if ($HIP_PLATFORM eq "hcc") {
     }
 
     # Satisfy HCC dependencies
-    $HIPLDFLAGS .= " -lc++abi";
+    $HIPLDFLAGS .= " -lc++abi -lsupc++";
     $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt";
 
     # Handle ROCm target platform
@@ -151,13 +151,12 @@ if ($HIP_PLATFORM eq "hcc") {
     $HIPCC="$CUDA_PATH/bin/nvcc";
     $HIPCXXFLAGS .= " -I$CUDA_PATH/include";
 
-    $HIPLDFLAGS = "";
+    $HIPLDFLAGS = "-lcuda -lcudart";
 } else {
     printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'");
     exit (-1);
 }
 
-
 # Add paths to common HIP includes:
 $HIPCXXFLAGS .= " -I$HIP_PATH/include" ;
 
@@ -177,7 +176,41 @@ if ($verbose & 0x4) {
     print "hipcc-args: ", join (" ", @ARGV), "\n";
 }
 
+# Handle code object generation
+my $ISACMD="";
+if($HIP_PLATFORM eq "hcc"){
+  $ISACMD .= "$HIP_PATH/bin/hipgenisa.sh ";
+  $ISACMD .= $ROCM_PATH;
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV[1..$#ARGV]){
+      $ISACMD .= " ";
+      $ISACMD .= $isaarg;
+    }
+    if ($verbose & 0x1) {
+      print "hipcc-cmd: ", $ISACMD, "\n";
+    }
+    system($ISACMD) and die();
+    exit(0);
+  }
+}
+
+if($HIP_PLATFORM eq "nvcc"){
+  $ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV[1..$#ARGV]){
+      $ISACMD .= " ";
+      $ISACMD .= $isaarg;
+    }
+    if ($verbose & 0x1) {
+      print "hipcc-cmd: ", $ISACMD, "\n";
+    }
+    system($ISACMD) and die();
+    exit(0);
+  }
+}
+
 my $toolArgs = "";  # arguments to pass to the hcc or nvcc tool
+
 foreach $arg (@ARGV)
 {
     my $swallowArg = 0;
diff --git a/bin/hipgenisa.sh b/bin/hipgenisa.sh
new file mode 100755
index 0000000000..e60abbf78f
--- /dev/null
+++ b/bin/hipgenisa.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+if [ $1 = " " ]
+then
+exit
+fi
+
+ROCM_PATH=$1
+GEN_ISA=$2
+FILE_NAMES=$3
+OUT=$4
+OUTPUT_FILE=$5
+TARGET=""
+if [ ${GEN_ISA:0:12} = "--target-isa" ]
+then
+  TARGET=${GEN_ISA:13:12}
+fi
+
+SOURCE="${BASH_SOURCE[0]}"
+HIP_PATH="$( command cd -P "$( dirname "$SOURCE" )/.." && pwd )"
+
+export KMDUMPISA=1
+export KMDUMPLLVM=1
+mkdir /tmp/hipgenisa
+$HIP_PATH/bin/hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
+mv dump.* /tmp/hipgenisa/
+$ROCM_PATH/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
+$ROCM_PATH/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
+rm -r /tmp/hipgenisa
+export KMDUMPISA=0
+export KMDUMPLLVM=0
diff --git a/clang-hipify/src/Cuda2Hip.cpp b/clang-hipify/src/Cuda2Hip.cpp
index 45960ac8ca..fde5b7e55b 100644
--- a/clang-hipify/src/Cuda2Hip.cpp
+++ b/clang-hipify/src/Cuda2Hip.cpp
@@ -70,16 +70,27 @@ enum ConvTypes {
   CONV_OTHER,
   CONV_INCLUDE,
   CONV_INCLUDE_CUDA_MAIN_H,
+  CONV_TYPE,
   CONV_LITERAL,
-  CONV_BLAS,
+  CONV_NUMERIC_LITERAL,
   CONV_LAST
 };
 
-const char *counterNames[ConvTypes::CONV_LAST] = {
-    "dev",          "mem",    "kern",    "coord_func",   "math_func",
-    "special_func", "stream", "event",   "err",          "def",
-    "tex",          "other",  "include", "include_cuda_main_header",
-    "literal",      "blas"};
+const char *counterNames[CONV_LAST] = {
+    "dev",          "mem",     "kern",    "coord_func",   "math_func",
+    "special_func", "stream",  "event",   "err",          "def",
+    "tex",          "other",   "include", "include_cuda_main_header",
+    "type",         "literal", "numeric_literal"};
+
+enum ApiTypes {
+  API_DRIVER = 0,
+  API_RUNTIME,
+  API_BLAS,
+  API_LAST
+};
+
+const char *apiNames[API_LAST] = {
+    "CUDA API", "CUDA RT API", "CUDA BLAS API"};
 
 namespace {
 
@@ -90,560 +101,560 @@ struct cuda2hipMap {
     cudaExcludes = {"CHECK_CUDA_ERROR", "CUDA_SAFE_CALL"};
 
     // Defines
-    cuda2hipRename["__CUDACC__"] = {"__HIPCC__", CONV_DEF};
+    cuda2hipRename["__CUDACC__"] = {"__HIPCC__", CONV_DEF, API_RUNTIME};
 
     // CUDA includes
-    cuda2hipRename["cuda.h"]             = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H};
-    cuda2hipRename["cuda_runtime.h"] =     {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H};
-    cuda2hipRename["cuda_runtime_api.h"] = {"hip_runtime_api.h", CONV_INCLUDE};
+    cuda2hipRename["cuda.h"]             = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER};
+    cuda2hipRename["cuda_runtime.h"] =     {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME};
+    cuda2hipRename["cuda_runtime_api.h"] = {"hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME};
 
     // HIP includes
     // TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h
-    //cuda2hipRename["cudacommon.h"] = {"hipcommon.h", CONV_INCLUDE};
+    //cuda2hipRename["cudacommon.h"] = {"hipcommon.h", CONV_INCLUDE, API_RUNTIME};
 
     // CUBLAS includes
-    cuda2hipRename["cublas.h"]           = {"hipblas.h", CONV_INCLUDE};
-    cuda2hipRename["cublas_v2.h"]        = {"hipblas.h", CONV_INCLUDE};
+    cuda2hipRename["cublas.h"]           = {"hipblas.h", CONV_INCLUDE, API_BLAS};
+    cuda2hipRename["cublas_v2.h"]        = {"hipblas.h", CONV_INCLUDE, API_BLAS};
 
     // Error codes and return types
-    cuda2hipRename["cudaError_t"]                          = {"hipError_t", CONV_ERR};
-    cuda2hipRename["cudaError"]                            = {"hipError", CONV_ERR};
-    cuda2hipRename["cudaSuccess"]                          = {"hipSuccess", CONV_ERR};
-    cuda2hipRename["cudaErrorUnknown"]                     = {"hipErrorUnknown", CONV_ERR};
-    cuda2hipRename["cudaErrorMemoryAllocation"]            = {"hipErrorMemoryAllocation", CONV_ERR};
-    cuda2hipRename["cudaErrorMemoryFree"]                  = {"hipErrorMemoryFree", CONV_ERR};
-    cuda2hipRename["cudaErrorUnknownSymbol"]               = {"hipErrorUnknownSymbol", CONV_ERR};
-    cuda2hipRename["cudaErrorOutOfResources"]              = {"hipErrorOutOfResources", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidValue"]                = {"hipErrorInvalidValue", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidResourceHandle"]       = {"hipErrorInvalidResourceHandle", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidDevice"]               = {"hipErrorInvalidDevice", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidMemcpyDirection"]      = {"hipErrorInvalidMemcpyDirection", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidDevicePointer"]        = {"hipErrorInvalidDevicePointer", CONV_ERR};
-    cuda2hipRename["cudaErrorInitializationError"]         = {"hipErrorInvalidDevicePointer", CONV_ERR};
-    cuda2hipRename["cudaErrorNoDevice"]                    = {"hipErrorNoDevice", CONV_ERR};
-    cuda2hipRename["cudaErrorNotReady"]                    = {"hipErrorNotReady", CONV_ERR};
-    cuda2hipRename["cudaErrorPeerAccessNotEnabled"]        = {"hipErrorPeerAccessNotEnabled", CONV_ERR};
-    cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"]    = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR};
+    cuda2hipRename["cudaError_t"]                          = {"hipError_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaError"]                            = {"hipError", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaSuccess"]                          = {"hipSuccess", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorUnknown"]                     = {"hipErrorUnknown", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorMemoryAllocation"]            = {"hipErrorMemoryAllocation", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorMemoryFree"]                  = {"hipErrorMemoryFree", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorUnknownSymbol"]               = {"hipErrorUnknownSymbol", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorOutOfResources"]              = {"hipErrorOutOfResources", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidValue"]                = {"hipErrorInvalidValue", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidResourceHandle"]       = {"hipErrorInvalidResourceHandle", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidDevice"]               = {"hipErrorInvalidDevice", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidMemcpyDirection"]      = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidDevicePointer"]        = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInitializationError"]         = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorNoDevice"]                    = {"hipErrorNoDevice", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorNotReady"]                    = {"hipErrorNotReady", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorPeerAccessNotEnabled"]        = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"]    = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME};
     // NOTE: no corresponding error type in CUDA
-    //cuda2hipRename["cudaErrorRuntimeMemory"]               = {"hipErrorRuntimeMemory", CONV_ERR};
-    //cuda2hipRename["cudaErrorRuntimeOther"]                = {"hipErrorRuntimeOther", CONV_ERR};
-    cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR};
-    cuda2hipRename["cudaErrorHostMemoryNotRegistered"]     = {"hipErrorHostMemoryNotRegistered", CONV_ERR};
+    //cuda2hipRename["cudaErrorRuntimeMemory"]               = {"hipErrorRuntimeMemory", CONV_ERR, API_RUNTIME};
+    //cuda2hipRename["cudaErrorRuntimeOther"]                = {"hipErrorRuntimeOther", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorHostMemoryNotRegistered"]     = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME};
 
     // Error API
-    cuda2hipRename["cudaGetLastError"]               = {"hipGetLastError", CONV_ERR};
-    cuda2hipRename["cudaPeekAtLastError"]            = {"hipPeekAtLastError", CONV_ERR};
-    cuda2hipRename["cudaGetErrorName"]               = {"hipGetErrorName", CONV_ERR};
-    cuda2hipRename["cudaGetErrorString"]             = {"hipGetErrorString", CONV_ERR};
+    cuda2hipRename["cudaGetLastError"]               = {"hipGetLastError", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaPeekAtLastError"]            = {"hipPeekAtLastError", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaGetErrorName"]               = {"hipGetErrorName", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaGetErrorString"]             = {"hipGetErrorString", CONV_ERR, API_RUNTIME};
 
     // Memcpy
-    cuda2hipRename["cudaMemcpy"]               = {"hipMemcpy", CONV_MEM};
-    cuda2hipRename["cudaMemcpyToSymbol"]       = {"hipMemcpyToSymbol", CONV_MEM};
-    cuda2hipRename["cudaMemset"]               = {"hipMemset", CONV_MEM};
-    cuda2hipRename["cudaMemsetAsync"]          = {"hipMemsetAsync", CONV_MEM};
-    cuda2hipRename["cudaMemcpyAsync"]          = {"hipMemcpyAsync", CONV_MEM};
-    cuda2hipRename["cudaMemGetInfo"]           = {"hipMemGetInfo", CONV_MEM};
+    cuda2hipRename["cudaMemcpy"]               = {"hipMemcpy", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyToSymbol"]       = {"hipMemcpyToSymbol", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemset"]               = {"hipMemset", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemsetAsync"]          = {"hipMemsetAsync", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyAsync"]          = {"hipMemcpyAsync", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemGetInfo"]           = {"hipMemGetInfo", CONV_MEM, API_RUNTIME};
     // Memcpy kind
-    cuda2hipRename["cudaMemcpyKind"]           = {"hipMemcpyKind", CONV_MEM};
-    cuda2hipRename["cudaMemcpyHostToHost"]     = {"hipMemcpyHostToHost", CONV_MEM};
-    cuda2hipRename["cudaMemcpyHostToDevice"]   = {"hipMemcpyHostToDevice", CONV_MEM};
-    cuda2hipRename["cudaMemcpyDeviceToHost"]   = {"hipMemcpyDeviceToHost", CONV_MEM};
-    cuda2hipRename["cudaMemcpyDeviceToDevice"] = {"hipMemcpyDeviceToDevice", CONV_MEM};
-    cuda2hipRename["cudaMemcpyDefault"]        = {"hipMemcpyDefault", CONV_MEM};
+    cuda2hipRename["cudaMemcpyKind"]           = {"hipMemcpyKind", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyHostToHost"]     = {"hipMemcpyHostToHost", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyHostToDevice"]   = {"hipMemcpyHostToDevice", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyDeviceToHost"]   = {"hipMemcpyDeviceToHost", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyDeviceToDevice"] = {"hipMemcpyDeviceToDevice", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyDefault"]        = {"hipMemcpyDefault", CONV_MEM, API_RUNTIME};
 
     // Memory management
-    cuda2hipRename["cudaMalloc"]         = {"hipMalloc", CONV_MEM};
-    cuda2hipRename["cudaMallocHost"]     = {"hipHostMalloc", CONV_MEM};
-    cuda2hipRename["cudaFree"]           = {"hipFree", CONV_MEM};
-    cuda2hipRename["cudaFreeHost"]       = {"hipHostFree", CONV_MEM};
-    cuda2hipRename["cudaHostRegister"]   = {"hipHostRegister", CONV_MEM};
-    cuda2hipRename["cudaHostUnregister"] = {"hipHostUnregister", CONV_MEM};
+    cuda2hipRename["cudaMalloc"]         = {"hipMalloc", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMallocHost"]     = {"hipHostMalloc", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaFree"]           = {"hipFree", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaFreeHost"]       = {"hipHostFree", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegister"]   = {"hipHostRegister", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_RUNTIME};
 
     // Memory types
-    cuda2hipRename["cudaMemoryType"]       = {"hipMemoryType", CONV_MEM};
-    cuda2hipRename["cudaMemoryTypeHost"]   = {"hipMemoryTypeHost", CONV_MEM};
-    cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM};
+    cuda2hipRename["cudaMemoryType"]       = {"hipMemoryType", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemoryTypeHost"]   = {"hipMemoryTypeHost", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM, API_RUNTIME};
 
     // Host Malloc Flags
-    cuda2hipRename["cudaHostAllocDefault"]       = {"hipHostMallocDefault", CONV_MEM};
-    cuda2hipRename["cudaHostAllocPortable"]      = {"hipHostMallocPortable", CONV_MEM};
-    cuda2hipRename["cudaHostAllocMapped"]        = {"hipHostMallocMapped", CONV_MEM};
-    cuda2hipRename["cudaHostAllocWriteCombined"] = {"hipHostMallocWriteCombined", CONV_MEM};
+    cuda2hipRename["cudaHostAllocDefault"]       = {"hipHostMallocDefault", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostAllocPortable"]      = {"hipHostMallocPortable", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostAllocMapped"]        = {"hipHostMallocMapped", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostAllocWriteCombined"] = {"hipHostMallocWriteCombined", CONV_MEM, API_RUNTIME};
 
     // Host Register Flags
-    cuda2hipRename["cudaHostGetFlags"]         = {"hipHostGetFlags", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterDefault"]  = {"hipHostRegisterDefault", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterPortable"] = {"hipHostRegisterPortable", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterMapped"]   = {"hipHostRegisterMapped", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterIoMemory"] = {"hipHostRegisterIoMemory", CONV_MEM};
+    cuda2hipRename["cudaHostGetFlags"]         = {"hipHostGetFlags", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterDefault"]  = {"hipHostRegisterDefault", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterPortable"] = {"hipHostRegisterPortable", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterMapped"]   = {"hipHostRegisterMapped", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterIoMemory"] = {"hipHostRegisterIoMemory", CONV_MEM, API_RUNTIME};
 
     // Coordinate Indexing and Dimensions
-    cuda2hipRename["threadIdx.x"] = {"hipThreadIdx_x", CONV_COORD_FUNC};
-    cuda2hipRename["threadIdx.y"] = {"hipThreadIdx_y", CONV_COORD_FUNC};
-    cuda2hipRename["threadIdx.z"] = {"hipThreadIdx_z", CONV_COORD_FUNC};
+    cuda2hipRename["threadIdx.x"] = {"hipThreadIdx_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["threadIdx.y"] = {"hipThreadIdx_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["threadIdx.z"] = {"hipThreadIdx_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["warpSize"]    = {"hipWarpSize", CONV_SPECIAL_FUNC};
+    cuda2hipRename["warpSize"]    = {"hipWarpSize", CONV_SPECIAL_FUNC, API_RUNTIME};
 
     // Events
-    cuda2hipRename["cudaEvent_t"]              = {"hipEvent_t", CONV_EVENT};
-    cuda2hipRename["cudaEventCreate"]          = {"hipEventCreate", CONV_EVENT};
-    cuda2hipRename["cudaEventCreateWithFlags"] = {"hipEventCreateWithFlags", CONV_EVENT};
-    cuda2hipRename["cudaEventDestroy"]         = {"hipEventDestroy", CONV_EVENT};
-    cuda2hipRename["cudaEventRecord"]          = {"hipEventRecord", CONV_EVENT};
-    cuda2hipRename["cudaEventElapsedTime"]     = {"hipEventElapsedTime", CONV_EVENT};
-    cuda2hipRename["cudaEventSynchronize"]     = {"hipEventSynchronize", CONV_EVENT};
-    cuda2hipRename["cudaEventQuery"]           = {"hipEventQuery", CONV_EVENT};
+    cuda2hipRename["cudaEvent_t"]              = {"hipEvent_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaEventCreate"]          = {"hipEventCreate", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventCreateWithFlags"] = {"hipEventCreateWithFlags", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventDestroy"]         = {"hipEventDestroy", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventRecord"]          = {"hipEventRecord", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventElapsedTime"]     = {"hipEventElapsedTime", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventSynchronize"]     = {"hipEventSynchronize", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventQuery"]           = {"hipEventQuery", CONV_EVENT, API_RUNTIME};
     // Event Flags
-    cuda2hipRename["cudaEventDefault"]         = {"hipEventDefault", CONV_EVENT};
-    cuda2hipRename["cudaEventBlockingSync"]    = {"hipEventBlockingSync", CONV_EVENT};
-    cuda2hipRename["cudaEventDisableTiming"]   = {"hipEventDisableTiming", CONV_EVENT};
-    cuda2hipRename["cudaEventInterprocess"]    = {"hipEventInterprocess", CONV_EVENT};
+    cuda2hipRename["cudaEventDefault"]         = {"hipEventDefault", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventBlockingSync"]    = {"hipEventBlockingSync", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventDisableTiming"]   = {"hipEventDisableTiming", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventInterprocess"]    = {"hipEventInterprocess", CONV_EVENT, API_RUNTIME};
 
     // Streams
-    cuda2hipRename["cudaStream_t"]              = {"hipStream_t", CONV_STREAM};
-    cuda2hipRename["cudaStreamCreate"]          = {"hipStreamCreate", CONV_STREAM};
-    cuda2hipRename["cudaStreamCreateWithFlags"] = {"hipStreamCreateWithFlags", CONV_STREAM};
-    cuda2hipRename["cudaStreamDestroy"]         = {"hipStreamDestroy", CONV_STREAM};
-    cuda2hipRename["cudaStreamWaitEvent"]       = {"hipStreamWaitEvent", CONV_STREAM};
-    cuda2hipRename["cudaStreamSynchronize"]     = {"hipStreamSynchronize", CONV_STREAM};
+    cuda2hipRename["cudaStream_t"]              = {"hipStream_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaStreamCreate"]          = {"hipStreamCreate", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamCreateWithFlags"] = {"hipStreamCreateWithFlags", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamDestroy"]         = {"hipStreamDestroy", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamWaitEvent"]       = {"hipStreamWaitEvent", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamSynchronize"]     = {"hipStreamSynchronize", CONV_STREAM, API_RUNTIME};
     // Stream Flags
-    cuda2hipRename["cudaStreamGetFlags"]        = {"hipStreamGetFlags", CONV_STREAM};
-    cuda2hipRename["cudaStreamDefault"]         = {"hipStreamDefault", CONV_STREAM};
-    cuda2hipRename["cudaStreamNonBlocking"]     = {"hipStreamNonBlocking", CONV_STREAM};
+    cuda2hipRename["cudaStreamGetFlags"]        = {"hipStreamGetFlags", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamDefault"]         = {"hipStreamDefault", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamNonBlocking"]     = {"hipStreamNonBlocking", CONV_STREAM, API_RUNTIME};
 
     // Other synchronization
-    cuda2hipRename["cudaDeviceSynchronize"] = {"hipDeviceSynchronize", CONV_DEV};
+    cuda2hipRename["cudaDeviceSynchronize"] = {"hipDeviceSynchronize", CONV_DEV, API_RUNTIME};
     // translate deprecated cudaThreadSynchronize
-    cuda2hipRename["cudaThreadSynchronize"] = {"hipDeviceSynchronize", CONV_DEV};
-    cuda2hipRename["cudaDeviceReset"]       = {"hipDeviceReset", CONV_DEV};
+    cuda2hipRename["cudaThreadSynchronize"] = {"hipDeviceSynchronize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceReset"]       = {"hipDeviceReset", CONV_DEV, API_RUNTIME};
     // translate deprecated cudaThreadExit
-    cuda2hipRename["cudaThreadExit"]        = {"hipDeviceReset", CONV_DEV};
-    cuda2hipRename["cudaSetDevice"]         = {"hipSetDevice", CONV_DEV};
-    cuda2hipRename["cudaGetDevice"]         = {"hipGetDevice", CONV_DEV};
+    cuda2hipRename["cudaThreadExit"]        = {"hipDeviceReset", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSetDevice"]         = {"hipSetDevice", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaGetDevice"]         = {"hipGetDevice", CONV_DEV, API_RUNTIME};
 
     // Attributes
-    cuda2hipRename["cudaDeviceAttr"]          = {"hipDeviceAttribute_t", CONV_DEV};
-    cuda2hipRename["cudaDeviceGetAttribute"]  = {"hipDeviceGetAttribute", CONV_DEV};
+    cuda2hipRename["cudaDeviceAttr"]          = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaDeviceGetAttribute"]  = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME};
 
-    cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"]               = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxBlockDimX"]                     = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxBlockDimY"]                     = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxBlockDimZ"]                     = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxGridDimX"]                      = {"hipDeviceAttributeMaxGridDimX", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxGridDimY"]                      = {"hipDeviceAttributeMaxGridDimY", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxGridDimZ"]                      = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"]          = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV};
-    cuda2hipRename["cudaDevAttrTotalConstantMemory"]              = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV};
-    cuda2hipRename["cudaDevAttrWarpSize"]                         = {"hipDeviceAttributeWarpSize", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"]             = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV};
-    cuda2hipRename["cudaDevAttrClockRate"]                        = {"hipDeviceAttributeClockRate", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMemoryClockRate"]                  = {"hipDeviceAttributeMemoryClockRate", CONV_DEV};
-    cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"]             = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMultiProcessorCount"]              = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV};
-    cuda2hipRename["cudaDevAttrComputeMode"]                      = {"hipDeviceAttributeComputeMode", CONV_DEV};
-    cuda2hipRename["cudaDevAttrL2CacheSize"]                      = {"hipDeviceAttributeL2CacheSize", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"]      = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV};
-    cuda2hipRename["cudaDevAttrComputeCapabilityMajor"]           = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV};
-    cuda2hipRename["cudaDevAttrConcurrentKernels"]                = {"hipDeviceAttributeConcurrentKernels", CONV_DEV};
-    cuda2hipRename["cudaDevAttrPciBusId"]                         = {"hipDeviceAttributePciBusId", CONV_DEV};
-    cuda2hipRename["cudaDevAttrPciDeviceId"]                      = {"hipDeviceAttributePciDeviceId", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV};
-    cuda2hipRename["cudaDevAttrIsMultiGpuBoard"]                  = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV};
+    cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"]               = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimX"]                     = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimY"]                     = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimZ"]                     = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimX"]                      = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimY"]                      = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimZ"]                      = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"]          = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrTotalConstantMemory"]              = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrWarpSize"]                         = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"]             = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrClockRate"]                        = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMemoryClockRate"]                  = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"]             = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMultiProcessorCount"]              = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrComputeMode"]                      = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrL2CacheSize"]                      = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"]      = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrComputeCapabilityMajor"]           = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrConcurrentKernels"]                = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrPciBusId"]                         = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrPciDeviceId"]                      = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrIsMultiGpuBoard"]                  = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME};
 
     // Pointer Attributes
-    cuda2hipRename["cudaPointerAttributes"]      = {"hipPointerAttribute_t", CONV_MEM};
-    cuda2hipRename["cudaPointerGetAttributes"]   = {"hipPointerGetAttributes", CONV_MEM};
+    cuda2hipRename["cudaPointerAttributes"]      = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaPointerGetAttributes"]   = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME};
 
-    cuda2hipRename["cudaHostGetDevicePointer"]   = {"hipHostGetDevicePointer", CONV_MEM};
+    cuda2hipRename["cudaHostGetDevicePointer"]   = {"hipHostGetDevicePointer", CONV_MEM, API_RUNTIME};
 
     // Device
-    cuda2hipRename["cudaDeviceProp"]            = {"hipDeviceProp_t", CONV_DEV};
-    cuda2hipRename["cudaGetDeviceProperties"]   = {"hipGetDeviceProperties", CONV_DEV};
+    cuda2hipRename["cudaDeviceProp"]            = {"hipDeviceProp_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaGetDeviceProperties"]   = {"hipGetDeviceProperties", CONV_DEV, API_RUNTIME};
 
     // Device Flags
-    cuda2hipRename["cudaSetDeviceFlags"]               = {"hipSetDeviceFlags", CONV_DEV};
-    cuda2hipRename["cudaDeviceScheduleAuto"]           = {"hipDeviceScheduleAuto", CONV_DEV};
-    cuda2hipRename["cudaDeviceScheduleSpin"]           = {"hipDeviceScheduleSpin", CONV_DEV};
-    cuda2hipRename["cudaDeviceScheduleYield"]          = {"hipDeviceScheduleYield", CONV_DEV};
+    cuda2hipRename["cudaSetDeviceFlags"]               = {"hipSetDeviceFlags", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceScheduleAuto"]           = {"hipDeviceScheduleAuto", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceScheduleSpin"]           = {"hipDeviceScheduleSpin", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceScheduleYield"]          = {"hipDeviceScheduleYield", CONV_DEV, API_RUNTIME};
     // deprecated as of CUDA 4.0 and replaced with cudaDeviceScheduleBlockingSync
-    cuda2hipRename["cudaDeviceBlockingSync"]           = {"hipDeviceBlockingSync", CONV_DEV};
+    cuda2hipRename["cudaDeviceBlockingSync"]           = {"hipDeviceBlockingSync", CONV_DEV, API_RUNTIME};
     // unsupported yet
-    //cuda2hipRename["cudaDeviceScheduleBlockingSync"] = {"hipDeviceScheduleBlockingSync", CONV_DEV};
-    //cuda2hipRename["cudaDeviceScheduleMask"]         = {"hipDeviceScheduleMask", CONV_DEV};
-    cuda2hipRename["cudaDeviceMapHost"]                = {"hipDeviceMapHost", CONV_DEV};
+    //cuda2hipRename["cudaDeviceScheduleBlockingSync"] = {"hipDeviceScheduleBlockingSync", CONV_DEV, API_RUNTIME};
+    //cuda2hipRename["cudaDeviceScheduleMask"]         = {"hipDeviceScheduleMask", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceMapHost"]                = {"hipDeviceMapHost", CONV_DEV, API_RUNTIME};
     // unsupported yet
-    //cuda2hipRename["cudaDeviceLmemResizeToMax"]      = {"hipDeviceLmemResizeToMax", CONV_DEV};
-    //cuda2hipRename["cudaDeviceMask"]                 = {"hipDeviceMask", CONV_DEV};
+    //cuda2hipRename["cudaDeviceLmemResizeToMax"]      = {"hipDeviceLmemResizeToMax", CONV_DEV, API_RUNTIME};
+    //cuda2hipRename["cudaDeviceMask"]                 = {"hipDeviceMask", CONV_DEV, API_RUNTIME};
 
     // Cache config
-    cuda2hipRename["cudaDeviceSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV};
+    cuda2hipRename["cudaDeviceSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV};
-    cuda2hipRename["cudaDeviceGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV};
+    cuda2hipRename["cudaThreadSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV};
-    cuda2hipRename["cudaFuncCache"]             = {"hipFuncCache", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferNone"]   = {"hipFuncCachePreferNone", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferL1"]     = {"hipFuncCachePreferL1", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferEqual"]  = {"hipFuncCachePreferEqual", CONV_DEV};
-    cuda2hipRename["cudaFuncSetCacheConfig"]    = {"hipFuncSetCacheConfig", CONV_DEV};
+    cuda2hipRename["cudaThreadGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCache"]             = {"hipFuncCache", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferNone"]   = {"hipFuncCachePreferNone", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferL1"]     = {"hipFuncCachePreferL1", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferEqual"]  = {"hipFuncCachePreferEqual", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncSetCacheConfig"]    = {"hipFuncSetCacheConfig", CONV_DEV, API_RUNTIME};
 
     // Driver/Runtime
-    cuda2hipRename["cudaDriverGetVersion"] = {"hipDriverGetVersion", CONV_DEV};
-    cuda2hipRename["cudaGetDeviceCount"]   = {"hipGetDeviceCount", CONV_DEV};
+    cuda2hipRename["cudaDriverGetVersion"] = {"hipDriverGetVersion", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaGetDeviceCount"]   = {"hipGetDeviceCount", CONV_DEV, API_RUNTIME};
     // unsupported yet
-    //cuda2hipRename["cudaRuntimeGetVersion"] = {"hipRuntimeGetVersion", CONV_DEV};
+    //cuda2hipRename["cudaRuntimeGetVersion"] = {"hipRuntimeGetVersion", CONV_DEV, API_RUNTIME};
 
     // Peer2Peer
-    cuda2hipRename["cudaDeviceCanAccessPeer"]     = {"hipDeviceCanAccessPeer", CONV_DEV};
-    cuda2hipRename["cudaDeviceDisablePeerAccess"] = {"hipDeviceDisablePeerAccess", CONV_DEV};
-    cuda2hipRename["cudaDeviceEnablePeerAccess"]  = {"hipDeviceEnablePeerAccess", CONV_DEV};
-    cuda2hipRename["cudaMemcpyPeerAsync"]         = {"hipMemcpyPeerAsync", CONV_MEM};
-    cuda2hipRename["cudaMemcpyPeer"]              = {"hipMemcpyPeer", CONV_MEM};
+    cuda2hipRename["cudaDeviceCanAccessPeer"]     = {"hipDeviceCanAccessPeer", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceDisablePeerAccess"] = {"hipDeviceDisablePeerAccess", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceEnablePeerAccess"]  = {"hipDeviceEnablePeerAccess", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyPeerAsync"]         = {"hipMemcpyPeerAsync", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyPeer"]              = {"hipMemcpyPeer", CONV_MEM, API_RUNTIME};
 
     // Shared memory
-    cuda2hipRename["cudaDeviceSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV};
+    cuda2hipRename["cudaDeviceSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV};
-    cuda2hipRename["cudaDeviceGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV};
+    cuda2hipRename["cudaThreadSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV};
-    cuda2hipRename["cudaSharedMemConfig"]            = {"hipSharedMemConfig", CONV_DEV};
-    cuda2hipRename["cudaSharedMemBankSizeDefault"]   = {"hipSharedMemBankSizeDefault", CONV_DEV};
-    cuda2hipRename["cudaSharedMemBankSizeFourByte"]  = {"hipSharedMemBankSizeFourByte", CONV_DEV};
-    cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV};
+    cuda2hipRename["cudaThreadGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemConfig"]            = {"hipSharedMemConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemBankSizeDefault"]   = {"hipSharedMemBankSizeDefault", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemBankSizeFourByte"]  = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_RUNTIME};
 
     // Profiler
     // unsupported yet
-    //cuda2hipRename["cudaProfilerInitialize"]  = {"hipProfilerInitialize", CONV_OTHER};
-    cuda2hipRename["cudaProfilerStart"]       = {"hipProfilerStart", CONV_OTHER};
-    cuda2hipRename["cudaProfilerStop"]        = {"hipProfilerStop", CONV_OTHER};
-    cuda2hipRename["cudaChannelFormatDesc"]   = {"hipChannelFormatDesc", CONV_TEX};
-    cuda2hipRename["cudaFilterModePoint"]     = {"hipFilterModePoint", CONV_TEX};
-    cuda2hipRename["cudaReadModeElementType"] = {"hipReadModeElementType", CONV_TEX};
+    //cuda2hipRename["cudaProfilerInitialize"]  = {"hipProfilerInitialize", CONV_OTHER, API_RUNTIME};
+    cuda2hipRename["cudaProfilerStart"]       = {"hipProfilerStart", CONV_OTHER, API_RUNTIME};
+    cuda2hipRename["cudaProfilerStop"]        = {"hipProfilerStop", CONV_OTHER, API_RUNTIME};
+    cuda2hipRename["cudaChannelFormatDesc"]   = {"hipChannelFormatDesc", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaFilterModePoint"]     = {"hipFilterModePoint", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaReadModeElementType"] = {"hipReadModeElementType", CONV_TEX, API_RUNTIME};
 
     // Channel descriptor
-    cuda2hipRename["cudaCreateChannelDesc"]   = {"hipCreateChannelDesc", CONV_TEX};
-    cuda2hipRename["cudaBindTexture"]         = {"hipBindTexture", CONV_TEX};
-    cuda2hipRename["cudaUnbindTexture"]       = {"hipUnbindTexture", CONV_TEX};
+    cuda2hipRename["cudaCreateChannelDesc"]   = {"hipCreateChannelDesc", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaBindTexture"]         = {"hipBindTexture", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaUnbindTexture"]       = {"hipUnbindTexture", CONV_TEX, API_RUNTIME};
 
     //---------------------------------------BLAS-------------------------------------//
     // Blas types
-    cuda2hipRename["cublasHandle_t"]                 = {"hipblasHandle_t", CONV_BLAS};
+    cuda2hipRename["cublasHandle_t"]                 = {"hipblasHandle_t", CONV_TYPE, API_BLAS};
     // Blas operations
-    cuda2hipRename["cublasOperation_t"]              = {"hipblasOperation_t", CONV_BLAS};
-    cuda2hipRename["CUBLAS_OP_N"]                    = {"HIPBLAS_OP_N", CONV_BLAS};
-    cuda2hipRename["CUBLAS_OP_T"]                    = {"HIPBLAS_OP_T", CONV_BLAS};
-    cuda2hipRename["CUBLAS_OP_C"]                    = {"HIPBLAS_OP_C", CONV_BLAS};
+    cuda2hipRename["cublasOperation_t"]              = {"hipblasOperation_t", CONV_TYPE, API_BLAS};
+    cuda2hipRename["CUBLAS_OP_N"]                    = {"HIPBLAS_OP_N", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_OP_T"]                    = {"HIPBLAS_OP_T", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_OP_C"]                    = {"HIPBLAS_OP_C", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas statuses
-    cuda2hipRename["cublasStatus_t"]                 = {"hipblasStatus_t", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_SUCCESS"]          = {"HIPBLAS_STATUS_SUCCESS", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_NOT_INITIALIZED"]  = {"HIPBLAS_STATUS_NOT_INITIALIZED", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_ALLOC_FAILED"]     = {"HIPBLAS_STATUS_ALLOC_FAILED", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_INVALID_VALUE"]    = {"HIPBLAS_STATUS_INVALID_VALUE", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_MAPPING_ERROR"]    = {"HIPBLAS_STATUS_MAPPING_ERROR", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_EXECUTION_FAILED"] = {"HIPBLAS_STATUS_EXECUTION_FAILED", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_INTERNAL_ERROR"]   = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_NOT_SUPPORTED"]    = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_BLAS};
+    cuda2hipRename["cublasStatus_t"]                 = {"hipblasStatus_t", CONV_TYPE, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_SUCCESS"]          = {"HIPBLAS_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_NOT_INITIALIZED"]  = {"HIPBLAS_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_ALLOC_FAILED"]     = {"HIPBLAS_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_INVALID_VALUE"]    = {"HIPBLAS_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_MAPPING_ERROR"]    = {"HIPBLAS_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_EXECUTION_FAILED"] = {"HIPBLAS_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_INTERNAL_ERROR"]   = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_NOT_SUPPORTED"]    = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Fill Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasFillMode_t"]               = {"hipblasFillMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_FILL_MODE_LOWER"]         = {"HIPBLAS_FILL_MODE_LOWER", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_FILL_MODE_UPPER"]         = {"HIPBLAS_FILL_MODE_UPPER", CONV_BLAS};
+    //cuda2hipRename["cublasFillMode_t"]               = {"hipblasFillMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_FILL_MODE_LOWER"]         = {"HIPBLAS_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_FILL_MODE_UPPER"]         = {"HIPBLAS_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Diag Types
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasDiagType_t"]               = {"hipblasDiagType_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DIAG_NON_UNIT"]           = {"HIPBLAS_DIAG_NON_UNIT", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DIAG_UNIT"]               = {"HIPBLAS_DIAG_UNIT", CONV_BLAS};
+    //cuda2hipRename["cublasDiagType_t"]               = {"hipblasDiagType_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_DIAG_NON_UNIT"]           = {"HIPBLAS_DIAG_NON_UNIT", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DIAG_UNIT"]               = {"HIPBLAS_DIAG_UNIT", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Side Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasSideMode_t"]               = {"hipblasSideMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_SIDE_LEFT"]               = {"HIPBLAS_SIDE_LEFT", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_SIDE_RIGHT"]              = {"HIPBLAS_SIDE_RIGHT", CONV_BLAS};
+    //cuda2hipRename["cublasSideMode_t"]               = {"hipblasSideMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_SIDE_LEFT"]               = {"HIPBLAS_SIDE_LEFT", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_SIDE_RIGHT"]              = {"HIPBLAS_SIDE_RIGHT", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Pointer Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasPointerMode_t"]            = {"hipblasPointerMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_POINTER_MODE_HOST"]       = {"HIPBLAS_POINTER_MODE_HOST", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_POINTER_MODE_DEVICE"]     = {"HIPBLAS_POINTER_MODE_DEVICE", CONV_BLAS};
+    //cuda2hipRename["cublasPointerMode_t"]            = {"hipblasPointerMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_POINTER_MODE_HOST"]       = {"HIPBLAS_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_POINTER_MODE_DEVICE"]     = {"HIPBLAS_POINTER_MODE_DEVICE", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Atomics Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasAtomicsMode_t"]            = {"hipblasAtomicsMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_ATOMICS_NOT_ALLOWED"]     = {"HIPBLAS_ATOMICS_NOT_ALLOWED", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_ATOMICS_ALLOWED"]         = {"HIPBLAS_ATOMICS_ALLOWED", CONV_BLAS};
+    //cuda2hipRename["cublasAtomicsMode_t"]            = {"hipblasAtomicsMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_ATOMICS_NOT_ALLOWED"]     = {"HIPBLAS_ATOMICS_NOT_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_ATOMICS_ALLOWED"]         = {"HIPBLAS_ATOMICS_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Data Type
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasDataType_t"]               = {"hipblasDataType_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_FLOAT"]              = {"HIPBLAS_DATA_FLOAT", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_DOUBLE"]             = {"HIPBLAS_DATA_DOUBLE", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_HALF"]               = {"HIPBLAS_DATA_HALF", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_INT8"]               = {"HIPBLAS_DATA_INT8", CONV_BLAS};
+    //cuda2hipRename["cublasDataType_t"]               = {"hipblasDataType_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_FLOAT"]              = {"HIPBLAS_DATA_FLOAT", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_DOUBLE"]             = {"HIPBLAS_DATA_DOUBLE", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_HALF"]               = {"HIPBLAS_DATA_HALF", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_INT8"]               = {"HIPBLAS_DATA_INT8", CONV_NUMERIC_LITERAL, API_BLAS};
 
     // Blas1 (v1) Routines
-    cuda2hipRename["cublasCreate"]       = {"hipblasCreate", CONV_BLAS};
-    cuda2hipRename["cublasDestroy"]      = {"hipblasDestroy", CONV_BLAS};
+    cuda2hipRename["cublasCreate"]       = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDestroy"]      = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS};
 
-    cuda2hipRename["cublasSetVector"]    = {"hipblasSetVector", CONV_BLAS};
-    cuda2hipRename["cublasGetVector"]    = {"hipblasGetVector", CONV_BLAS};
-    cuda2hipRename["cublasSetMatrix"]    = {"hipblasSetMatrix", CONV_BLAS};
-    cuda2hipRename["cublasGetMatrix"]    = {"hipblasGetMatrix", CONV_BLAS};
+    cuda2hipRename["cublasSetVector"]    = {"hipblasSetVector", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasGetVector"]    = {"hipblasGetVector", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasSetMatrix"]    = {"hipblasSetMatrix", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasGetMatrix"]    = {"hipblasGetMatrix", CONV_MATH_FUNC, API_BLAS};
 
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasGetMatrixAsync"] = {"hipblasGetMatrixAsync", CONV_BLAS};
-    //cuda2hipRename["cublasSetMatrixAsync"] = {"hipblasSetMatrixAsync", CONV_BLAS};
+    //cuda2hipRename["cublasGetMatrixAsync"] = {"hipblasGetMatrixAsync", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasSetMatrixAsync"] = {"hipblasSetMatrixAsync", CONV_MATH_FUNC, API_BLAS};
 
     // NRM2
-    //cuda2hipRename["cublasSnrm2"]  = {"hipblasSnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDnrm2"]  = {"hipblasDnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasScnrm2"] = {"hipblasScnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDznrm2"] = {"hipblasDznrm2", CONV_BLAS};
+    //cuda2hipRename["cublasSnrm2"]  = {"hipblasSnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDnrm2"]  = {"hipblasDnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScnrm2"] = {"hipblasScnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDznrm2"] = {"hipblasDznrm2", CONV_MATH_FUNC, API_BLAS};
 
     // DOT
-    cuda2hipRename["cublasSdot"]        = {"hipblasSdot", CONV_BLAS};
+    cuda2hipRename["cublasSdot"]        = {"hipblasSdot", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSdotBatched"] = {"hipblasSdotBatched",CONV_BLAS};
-    cuda2hipRename["cublasDdot"]        = {"hipblasDdot", CONV_BLAS};
+    cuda2hipRename["cublasSdotBatched"] = {"hipblasSdotBatched",CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDdot"]        = {"hipblasDdot", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDdotBatched"] = {"hipblasDdotBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCdotu"] = {"hipblasCdotu", CONV_BLAS};
-    //cuda2hipRename["cublasCdotc"] = {"hipblasCdotc", CONV_BLAS};
-    //cuda2hipRename["cublasZdotu"] = {"hipblasZdotu", CONV_BLAS};
-    //cuda2hipRename["cublasZdotc"] = {"hipblasZdotc", CONV_BLAS};
+    cuda2hipRename["cublasDdotBatched"] = {"hipblasDdotBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotu"] = {"hipblasCdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotc"] = {"hipblasCdotc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotu"] = {"hipblasZdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotc"] = {"hipblasZdotc", CONV_MATH_FUNC, API_BLAS};
 
     // SCAL
-    cuda2hipRename["cublasSscal"] = {"hipblasSscal", CONV_BLAS};
+    cuda2hipRename["cublasSscal"] = {"hipblasSscal", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSscalBatched"] = {"hipblasSscalBatched", CONV_BLAS};
-    cuda2hipRename["cublasDscal"] = {"hipblasDscal", CONV_BLAS};
+    cuda2hipRename["cublasSscalBatched"] = {"hipblasSscalBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDscal"] = {"hipblasDscal", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDscalBatched"] = {"hipblasDscalBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCscal"]  = {"hipblasCscal", CONV_BLAS};
-    //cuda2hipRename["cublasCsscal"] = {"hipblasCsscal", CONV_BLAS};
-    //cuda2hipRename["cublasZscal"]  = {"hipblasZscal", CONV_BLAS};
-    //cuda2hipRename["cublasZdscal"] = {"hipblasZdscal", CONV_BLAS};
+    cuda2hipRename["cublasDscalBatched"] = {"hipblasDscalBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCscal"]  = {"hipblasCscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsscal"] = {"hipblasCsscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZscal"]  = {"hipblasZscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdscal"] = {"hipblasZdscal", CONV_MATH_FUNC, API_BLAS};
 
     // AXPY
-    cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_BLAS};
+    cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_BLAS};
+    cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS};
 
     // COPY
-    cuda2hipRename["cublasScopy"] = {"hipblasScopy", CONV_BLAS};
+    cuda2hipRename["cublasScopy"] = {"hipblasScopy", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasScopyBatched"] = {"hipblasScopyBatched", CONV_BLAS};
-    cuda2hipRename["cublasDcopy"] = {"hipblasDcopy", CONV_BLAS};
+    cuda2hipRename["cublasScopyBatched"] = {"hipblasScopyBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDcopy"] = {"hipblasDcopy", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDcopyBatched"] = {"hipblasDcopyBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCcopy"] = {"hipblasCcopy", CONV_BLAS};
-    //cuda2hipRename["cublasZcopy"] = {"hipblasZcopy", CONV_BLAS};
+    cuda2hipRename["cublasDcopyBatched"] = {"hipblasDcopyBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCcopy"] = {"hipblasCcopy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZcopy"] = {"hipblasZcopy", CONV_MATH_FUNC, API_BLAS};
 
     // SWAP
-    //cuda2hipRename["cublasSswap"] = {"hipblasSswap", CONV_BLAS};
-    //cuda2hipRename["cublasDswap"] = {"hipblasDswap", CONV_BLAS};
-    //cuda2hipRename["cublasCswap"] = {"hipblasCswap", CONV_BLAS};
-    //cuda2hipRename["cublasZswap"] = {"hipblasZswap", CONV_BLAS};
+    //cuda2hipRename["cublasSswap"] = {"hipblasSswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDswap"] = {"hipblasDswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCswap"] = {"hipblasCswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZswap"] = {"hipblasZswap", CONV_MATH_FUNC, API_BLAS};
 
     // AMAX
-    //cuda2hipRename["cublasIsamax"] = {"hipblasIsamax", CONV_BLAS};
-    //cuda2hipRename["cublasIdamax"] = {"hipblasIdamax", CONV_BLAS};
-    //cuda2hipRename["cublasIcamax"] = {"hipblasIcamax", CONV_BLAS};
-    //cuda2hipRename["cublasIzamax"] = {"hipblasIzamax", CONV_BLAS};
+    //cuda2hipRename["cublasIsamax"] = {"hipblasIsamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamax"] = {"hipblasIdamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamax"] = {"hipblasIcamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamax"] = {"hipblasIzamax", CONV_MATH_FUNC, API_BLAS};
 
     // AMIN
-    //cuda2hipRename["cublasIsamin"] = {"hipblasIsamin", CONV_BLAS};
-    //cuda2hipRename["cublasIdamin"] = {"hipblasIdamin", CONV_BLAS};
-    //cuda2hipRename["cublasIcamin"] = {"hipblasIcamin", CONV_BLAS};
-    //cuda2hipRename["cublasIzamin"] = {"hipblasIzamin", CONV_BLAS};
+    //cuda2hipRename["cublasIsamin"] = {"hipblasIsamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamin"] = {"hipblasIdamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamin"] = {"hipblasIcamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamin"] = {"hipblasIzamin", CONV_MATH_FUNC, API_BLAS};
 
     // ASUM
-    cuda2hipRename["cublasSasum"]        = {"hipblasSasum", CONV_BLAS};
+    cuda2hipRename["cublasSasum"]        = {"hipblasSasum", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSasumBatched"] = {"hipblasSasumBatched", CONV_BLAS};
-    cuda2hipRename["cublasDasum"]        = {"hipblasDasum", CONV_BLAS};
+    cuda2hipRename["cublasSasumBatched"] = {"hipblasSasumBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDasum"]        = {"hipblasDasum", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDasumBatched"] = {"hipblasDasumBatched", CONV_BLAS};
-    //cuda2hipRename["cublasScasum"] = {"hipblasScasum", CONV_BLAS};
-    //cuda2hipRename["cublasDzasum"] = {"hipblasDzasum", CONV_BLAS};
+    cuda2hipRename["cublasDasumBatched"] = {"hipblasDasumBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScasum"] = {"hipblasScasum", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDzasum"] = {"hipblasDzasum", CONV_MATH_FUNC, API_BLAS};
 
     // ROT
-    //cuda2hipRename["cublasSrot"]  = {"hipblasSrot", CONV_BLAS};
-    //cuda2hipRename["cublasDrot"]  = {"hipblasDrot", CONV_BLAS};
-    //cuda2hipRename["cublasCrot"]  = {"hipblasCrot", CONV_BLAS};
-    //cuda2hipRename["cublasCsrot"] = {"hipblasCsrot", CONV_BLAS};
-    //cuda2hipRename["cublasZrot"]  = {"hipblasZrot", CONV_BLAS};
-    //cuda2hipRename["cublasZdrot"] = {"hipblasZdrot", CONV_BLAS};
+    //cuda2hipRename["cublasSrot"]  = {"hipblasSrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrot"]  = {"hipblasDrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrot"]  = {"hipblasCrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsrot"] = {"hipblasCsrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrot"]  = {"hipblasZrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdrot"] = {"hipblasZdrot", CONV_MATH_FUNC, API_BLAS};
 
     // ROTG
-    //cuda2hipRename["cublasSrotg"] = {"hipblasSrotg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotg"] = {"hipblasDrotg", CONV_BLAS};
-    //cuda2hipRename["cublasCrotg"] = {"hipblasCrotg", CONV_BLAS};
-    //cuda2hipRename["cublasZrotg"] = {"hipblasZrotg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotg"] = {"hipblasSrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotg"] = {"hipblasDrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrotg"] = {"hipblasCrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrotg"] = {"hipblasZrotg", CONV_MATH_FUNC, API_BLAS};
 
     // ROTM
-    //cuda2hipRename["cublasSrotm"] = {"hipblasSrotm", CONV_BLAS};
-    //cuda2hipRename["cublasDrotm"] = {"hipblasDrotm", CONV_BLAS};
+    //cuda2hipRename["cublasSrotm"] = {"hipblasSrotm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotm"] = {"hipblasDrotm", CONV_MATH_FUNC, API_BLAS};
 
     // ROTMG
-    //cuda2hipRename["cublasSrotmg"] = {"hipblasSrotmg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotmg"] = {"hipblasDrotmg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotmg"] = {"hipblasSrotmg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotmg"] = {"hipblasDrotmg", CONV_MATH_FUNC, API_BLAS};
 
     // GEMV
-    cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_BLAS};
+    cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_BLAS};
-    //cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_BLAS};
-    //cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_BLAS};
+    cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS};
 
     // GBMV
-    //cuda2hipRename["cublasSgbmv"] = {"hipblasSgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDgbmv"] = {"hipblasDgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCgbmv"] = {"hipblasCgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZgbmv"] = {"hipblasZgbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSgbmv"] = {"hipblasSgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgbmv"] = {"hipblasDgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgbmv"] = {"hipblasCgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgbmv"] = {"hipblasZgbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRMV
-    //cuda2hipRename["cublasStrmv"] = {"hipblasStrmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmv"] = {"hipblasDtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmv"] = {"hipblasCtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmv"] = {"hipblasZtrmv", CONV_BLAS};
+    //cuda2hipRename["cublasStrmv"] = {"hipblasStrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmv"] = {"hipblasDtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmv"] = {"hipblasCtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmv"] = {"hipblasZtrmv", CONV_MATH_FUNC, API_BLAS};
 
     // TBMV
-    //cuda2hipRename["cublasStbmv"] = {"hipblasStbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbmv"] = {"hipblasDtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbmv"] = {"hipblasCtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbmv"] = {"hipblasZtbmv", CONV_BLAS};
+    //cuda2hipRename["cublasStbmv"] = {"hipblasStbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbmv"] = {"hipblasDtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbmv"] = {"hipblasCtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbmv"] = {"hipblasZtbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TPMV
-    //cuda2hipRename["cublasStpmv"] = {"hipblasStpmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpmv"] = {"hipblasDtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpmv"] = {"hipblasCtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpmv"] = {"hipblasZtpmv", CONV_BLAS};
+    //cuda2hipRename["cublasStpmv"] = {"hipblasStpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpmv"] = {"hipblasDtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpmv"] = {"hipblasCtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpmv"] = {"hipblasZtpmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRSV
-    //cuda2hipRename["cublasStrsv"] = {"hipblasStrsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsv"] = {"hipblasDtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsv"] = {"hipblasCtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsv"] = {"hipblasZtrsv", CONV_BLAS};
+    //cuda2hipRename["cublasStrsv"] = {"hipblasStrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsv"] = {"hipblasDtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsv"] = {"hipblasCtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsv"] = {"hipblasZtrsv", CONV_MATH_FUNC, API_BLAS};
 
     // TPSV
-    //cuda2hipRename["cublasStpsv"] = {"hipblasStpsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpsv"] = {"hipblasDtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpsv"] = {"hipblasCtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpsv"] = {"hipblasZtpsv", CONV_BLAS};
+    //cuda2hipRename["cublasStpsv"] = {"hipblasStpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpsv"] = {"hipblasDtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpsv"] = {"hipblasCtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpsv"] = {"hipblasZtpsv", CONV_MATH_FUNC, API_BLAS};
 
     // TBSV
-    //cuda2hipRename["cublasStbsv"] = {"hipblasStbsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbsv"] = {"hipblasDtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbsv"] = {"hipblasCtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbsv"] = {"hipblasZtbsv", CONV_BLAS};
+    //cuda2hipRename["cublasStbsv"] = {"hipblasStbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbsv"] = {"hipblasDtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbsv"] = {"hipblasCtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbsv"] = {"hipblasZtbsv", CONV_MATH_FUNC, API_BLAS};
 
     // SYMV/HEMV
-    //cuda2hipRename["cublasSsymv"] = {"hipblasSsymv", CONV_BLAS};
-    //cuda2hipRename["cublasDsymv"] = {"hipblasDsymv", CONV_BLAS};
-    //cuda2hipRename["cublasCsymv"] = {"hipblasCsymv", CONV_BLAS};
-    //cuda2hipRename["cublasZsymv"] = {"hipblasZsymv", CONV_BLAS};
-    //cuda2hipRename["cublasChemv"] = {"hipblasChemv", CONV_BLAS};
-    //cuda2hipRename["cublasZhemv"] = {"hipblasZhemv", CONV_BLAS};
+    //cuda2hipRename["cublasSsymv"] = {"hipblasSsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymv"] = {"hipblasDsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymv"] = {"hipblasCsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymv"] = {"hipblasZsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChemv"] = {"hipblasChemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemv"] = {"hipblasZhemv", CONV_MATH_FUNC, API_BLAS};
 
     // SBMV/HBMV
-    //cuda2hipRename["cublasSsbmv"] = {"hipblasSsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDsbmv"] = {"hpiblasDsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasChbmv"] = {"hipblasChbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhbmv"] = {"hipblasZhbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSsbmv"] = {"hipblasSsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsbmv"] = {"hpiblasDsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChbmv"] = {"hipblasChbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhbmv"] = {"hipblasZhbmv", CONV_MATH_FUNC, API_BLAS};
 
     // SPMV/HPMV
-    //cuda2hipRename["cublasSspmv"] = {"hipblasSspmv", CONV_BLAS};
-    //cuda2hipRename["cublasDspmv"] = {"hipblasDspmv", CONV_BLAS};
-    //cuda2hipRename["cublasChpmv"] = {"hipblasChpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhpmv"] = {"hipblasZhpmv", CONV_BLAS};
+    //cuda2hipRename["cublasSspmv"] = {"hipblasSspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspmv"] = {"hipblasDspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpmv"] = {"hipblasChpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpmv"] = {"hipblasZhpmv", CONV_MATH_FUNC, API_BLAS};
 
     // GER
-    cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_BLAS};
-    //cuda2hipRename["cublasDger"]  = {"hipblasDger", CONV_BLAS};
-    //cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_BLAS};
-    //cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_BLAS};
-    //cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_BLAS};
-    //cuda2hipRename["cublasZgerc"] = {"hipblasZgerc", CONV_BLAS};
+    cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDger"]  = {"hipblasDger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgerc"] = {"hipblasZgerc", CONV_MATH_FUNC, API_BLAS};
 
     // SYR/HER
-    //cuda2hipRename["cublasSsyr"] = {"hipblasSsyr", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr"] = {"hipblasDsyr", CONV_BLAS};
-    //cuda2hipRename["cublasCher"] = {"hipblasCher", CONV_BLAS};
-    //cuda2hipRename["cublasZher"] = {"hipblasZher", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr"] = {"hipblasSsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr"] = {"hipblasDsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher"] = {"hipblasCher", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher"] = {"hipblasZher", CONV_MATH_FUNC, API_BLAS};
 
     // SPR/HPR
-    //cuda2hipRename["cublasSspr"] = {"hipblasSspr", CONV_BLAS};
-    //cuda2hipRename["cublasDspr"] = {"hipblasDspr", CONV_BLAS};
-    //cuda2hipRename["cublasChpr"] = {"hipblasChpr", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr"] = {"hipblasZhpr", CONV_BLAS};
+    //cuda2hipRename["cublasSspr"] = {"hipblasSspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr"] = {"hipblasDspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr"] = {"hipblasChpr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr"] = {"hipblasZhpr", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2/HER2
-    //cuda2hipRename["cublasSsyr2"] = {"hipblasSsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2"] = {"hipblasDsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasCher2"] = {"hipblasCher2", CONV_BLAS};
-    //cuda2hipRename["cublasZher2"] = {"hipblasZher2", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2"] = {"hipblasSsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2"] = {"hipblasDsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher2"] = {"hipblasCher2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2"] = {"hipblasZher2", CONV_MATH_FUNC, API_BLAS};
 
     // SPR2/HPR2
-    //cuda2hipRename["cublasSspr2"] = {"hipblasSspr2", CONV_BLAS};
-    //cuda2hipRename["cublasDspr2"] = {"hipblasDspr2", CONV_BLAS};
-    //cuda2hipRename["cublasChpr2"] = {"hipblasChpr2", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr2"] = {"hipblasZhpr2", CONV_BLAS};
+    //cuda2hipRename["cublasSspr2"] = {"hipblasSspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr2"] = {"hipblasDspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr2"] = {"hipblasChpr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr2"] = {"hipblasZhpr2", CONV_MATH_FUNC, API_BLAS};
 
     // Blas3 (v1) Routines
     // GEMM
-    cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_BLAS};
-    //cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_BLAS};
-    cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_BLAS};
-    //cuda2hipRename["cublasZgemm"] = {"hipblasZgemm", CONV_BLAS};
+    cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemm"] = {"hipblasZgemm", CONV_MATH_FUNC, API_BLAS};
 
     // BATCH GEMM
-    cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_BLAS};
-    cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasZgemmBatched"] = {"hipblasZgemmBatched", CONV_BLAS};
+    cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemmBatched"] = {"hipblasZgemmBatched", CONV_MATH_FUNC, API_BLAS};
 
     // SYRK
-    //cuda2hipRename["cublasSsyrk"] = {"hipblasSsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasDsyrk"] = {"hipblasDsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasCsyrk"] = {"hipblasCsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasZsyrk"] = {"hipblasZsyrk", CONV_BLAS};
+    //cuda2hipRename["cublasSsyrk"] = {"hipblasSsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyrk"] = {"hipblasDsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyrk"] = {"hipblasCsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyrk"] = {"hipblasZsyrk", CONV_MATH_FUNC, API_BLAS};
 
     // HERK
-    //cuda2hipRename["cublasCherk"] = {"hipblasCherk", CONV_BLAS};
-    //cuda2hipRename["cublasZherk"] = {"hipblasZherk", CONV_BLAS};
+    //cuda2hipRename["cublasCherk"] = {"hipblasCherk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZherk"] = {"hipblasZherk", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2K
-    //cuda2hipRename["cublasSsyr2k"] = {"hipblasSsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2k"] = {"hipblasDsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr2k"] = {"hipblasCsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr2k"] = {"hipblasZsyr2k", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2k"] = {"hipblasSsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2k"] = {"hipblasDsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr2k"] = {"hipblasCsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr2k"] = {"hipblasZsyr2k", CONV_MATH_FUNC, API_BLAS};
 
     // SYRKX - eXtended SYRK
     // cublasSsyrkx
@@ -652,40 +663,40 @@ struct cuda2hipMap {
     // cublasZsyrkx
 
     // HER2K
-    //cuda2hipRename["cublasCher2k"] = {"hipblasCher2k", CONV_BLAS};
-    //cuda2hipRename["cublasZher2k"] = {"hipblasZher2k", CONV_BLAS};
+    //cuda2hipRename["cublasCher2k"] = {"hipblasCher2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2k"] = {"hipblasZher2k", CONV_MATH_FUNC, API_BLAS};
 
     // HERKX - eXtended HERK
     // cublasCherkx
     // cublasZherkx
 
     // SYMM
-    //cuda2hipRename["cublasSsymm"] = {"hipblasSsymm", CONV_BLAS};
-    //cuda2hipRename["cublasDsymm"] = {"hipblasDsymm", CONV_BLAS};
-    //cuda2hipRename["cublasCsymm"] = {"hipblasCsymm", CONV_BLAS};
-    //cuda2hipRename["cublasZsymm"] = {"hipblasZsymm", CONV_BLAS};
+    //cuda2hipRename["cublasSsymm"] = {"hipblasSsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymm"] = {"hipblasDsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymm"] = {"hipblasCsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymm"] = {"hipblasZsymm", CONV_MATH_FUNC, API_BLAS};
 
     // HEMM
-    //cuda2hipRename["cublasChemm"] = {"hipblasChemm", CONV_BLAS};
-    //cuda2hipRename["cublasZhemm"] = {"hipblasZhemm", CONV_BLAS};
+    //cuda2hipRename["cublasChemm"] = {"hipblasChemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemm"] = {"hipblasZhemm", CONV_MATH_FUNC, API_BLAS};
 
     // TRSM
-    //cuda2hipRename["cublasStrsm"] = {"hipblasStrsm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsm"] = {"hipblasDtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsm"] = {"hipblasCtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsm"] = {"hipblasZtrsm", CONV_BLAS};
+    //cuda2hipRename["cublasStrsm"] = {"hipblasStrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsm"] = {"hipblasDtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsm"] = {"hipblasCtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsm"] = {"hipblasZtrsm", CONV_MATH_FUNC, API_BLAS};
 
     // TRSM - Batched Triangular Solver
-    //cuda2hipRename["cublasStrsmBatched"] = {"hipblasStrsmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsmBatched"] = {"hipblasDtrsmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsmBatched"] = {"hipblasCtrsmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsmBatched"] = {"hipblasZtrsmBatched", CONV_BLAS};
+    //cuda2hipRename["cublasStrsmBatched"] = {"hipblasStrsmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsmBatched"] = {"hipblasDtrsmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsmBatched"] = {"hipblasCtrsmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsmBatched"] = {"hipblasZtrsmBatched", CONV_MATH_FUNC, API_BLAS};
 
     // TRMM
-    //cuda2hipRename["cublasStrmm"] = {"hipblasStrmm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmm"] = {"hipblasDtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmm"] = {"hipblasCtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmm"] = {"hipblasZtrmm", CONV_BLAS};
+    //cuda2hipRename["cublasStrmm"] = {"hipblasStrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmm"] = {"hipblasDtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmm"] = {"hipblasCtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmm"] = {"hipblasZtrmm", CONV_MATH_FUNC, API_BLAS};
 
 
     // TO SUPPORT OR NOT? (cublas_api.h)
@@ -758,256 +769,257 @@ struct cuda2hipMap {
     // cublasZtrttp
 
     // Blas2 (v2) Routines
-    cuda2hipRename["cublasCreate_v2"] =  {"hipblasCreate", CONV_BLAS};
-    cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_BLAS};
+    cuda2hipRename["cublasCreate_v2"] =  {"hipblasCreate", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS};
 
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasGetVersion_v2"]     = {"hipblasGetVersion", CONV_BLAS};
-    //cuda2hipRename["cublasSetStream_v2"]      = {"hipblasSetStream", CONV_BLAS};
-    //cuda2hipRename["cublasGetStream_v2"]      = {"hipblasGetStream", CONV_BLAS};
-    //cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_BLAS};
-    //cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_BLAS};
+    //cuda2hipRename["cublasGetVersion_v2"]     = {"hipblasGetVersion", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasSetStream_v2"]      = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasGetStream_v2"]      = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS};
 
     // GEMV
-    cuda2hipRename["cublasSgemv_v2"] = {"hipblasSgemv", CONV_BLAS};
-    //cuda2hipRename["cublasDgemv_v2"] = {"hipblasDgemv", CONV_BLAS};
-    //cuda2hipRename["cublasCgemv_v2"] = {"hipblasCgemv", CONV_BLAS};
-    //cuda2hipRename["cublasZgemv_v2"] = {"hipblasZgemv", CONV_BLAS};
+    cuda2hipRename["cublasSgemv_v2"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemv_v2"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgemv_v2"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemv_v2"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS};
 
     // GBMV
-    //cuda2hipRename["cublasSgbmv_v2"] = {"hipblasSgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDgbmv_v2"] = {"hipblasDgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCgbmv_v2"] = {"hipblasCgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZgbmv_v2"] = {"hipblasZgbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSgbmv_v2"] = {"hipblasSgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgbmv_v2"] = {"hipblasDgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgbmv_v2"] = {"hipblasCgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgbmv_v2"] = {"hipblasZgbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRMV
-    //cuda2hipRename["cublasStrmv_v2"] = {"hipblasStrmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmv_v2"] = {"hipblasDtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmv_v2"] = {"hipblasCtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmv_v2"] = {"hipblasZtrmv", CONV_BLAS};
+    //cuda2hipRename["cublasStrmv_v2"] = {"hipblasStrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmv_v2"] = {"hipblasDtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmv_v2"] = {"hipblasCtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmv_v2"] = {"hipblasZtrmv", CONV_MATH_FUNC, API_BLAS};
 
     // TBMV
-    //cuda2hipRename["cublasStbmv_v2"] = {"hipblasStbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbmv_v2"] = {"hipblasDtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbmv_v2"] = {"hipblasCtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbmv_v2"] = {"hipblasZtbmv", CONV_BLAS};
+    //cuda2hipRename["cublasStbmv_v2"] = {"hipblasStbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbmv_v2"] = {"hipblasDtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbmv_v2"] = {"hipblasCtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbmv_v2"] = {"hipblasZtbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TPMV
-    //cuda2hipRename["cublasStpmv_v2"] = {"hipblasStpmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpmv_v2"] = {"hipblasDtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpmv_v2"] = {"hipblasCtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpmv_v2"] = {"hipblasZtpmv", CONV_BLAS};
+    //cuda2hipRename["cublasStpmv_v2"] = {"hipblasStpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpmv_v2"] = {"hipblasDtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpmv_v2"] = {"hipblasCtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpmv_v2"] = {"hipblasZtpmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRSV
-    //cuda2hipRename["cublasStrsv_v2"] = {"hipblasStrsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsv_v2"] = {"hipblasDtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsv_v2"] = {"hipblasCtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsv_v2"] = {"hipblasZtrsv", CONV_BLAS};
+    //cuda2hipRename["cublasStrsv_v2"] = {"hipblasStrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsv_v2"] = {"hipblasDtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsv_v2"] = {"hipblasCtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsv_v2"] = {"hipblasZtrsv", CONV_MATH_FUNC, API_BLAS};
 
     // TPSV
-    //cuda2hipRename["cublasStpsv_v2"] = {"hipblasStpsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpsv_v2"] = {"hipblasDtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpsv_v2"] = {"hipblasCtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpsv_v2"] = {"hipblasZtpsv", CONV_BLAS};
+    //cuda2hipRename["cublasStpsv_v2"] = {"hipblasStpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpsv_v2"] = {"hipblasDtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpsv_v2"] = {"hipblasCtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpsv_v2"] = {"hipblasZtpsv", CONV_MATH_FUNC, API_BLAS};
 
     // TBSV
-    //cuda2hipRename["cublasStbsv_v2"] = {"hipblasStbsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbsv_v2"] = {"hipblasDtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbsv_v2"] = {"hipblasCtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbsv_v2"] = {"hipblasZtbsv", CONV_BLAS};
+    //cuda2hipRename["cublasStbsv_v2"] = {"hipblasStbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbsv_v2"] = {"hipblasDtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbsv_v2"] = {"hipblasCtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbsv_v2"] = {"hipblasZtbsv", CONV_MATH_FUNC, API_BLAS};
 
     // SYMV/HEMV
-    //cuda2hipRename["cublasSsymv_v2"] = {"hipblasSsymv", CONV_BLAS};
-    //cuda2hipRename["cublasDsymv_v2"] = {"hipblasDsymv", CONV_BLAS};
-    //cuda2hipRename["cublasCsymv_v2"] = {"hipblasCsymv", CONV_BLAS};
-    //cuda2hipRename["cublasZsymv_v2"] = {"hipblasZsymv", CONV_BLAS};
-    //cuda2hipRename["cublasChemv_v2"] = {"hipblasChemv", CONV_BLAS};
-    //cuda2hipRename["cublasZhemv_v2"] = {"hipblasZhemv", CONV_BLAS};
+    //cuda2hipRename["cublasSsymv_v2"] = {"hipblasSsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymv_v2"] = {"hipblasDsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymv_v2"] = {"hipblasCsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymv_v2"] = {"hipblasZsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChemv_v2"] = {"hipblasChemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemv_v2"] = {"hipblasZhemv", CONV_MATH_FUNC, API_BLAS};
 
     // SBMV/HBMV
-    //cuda2hipRename["cublasSsbmv_v2"] = {"hipblasSsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDsbmv_v2"] = {"hpiblasDsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasChbmv_v2"] = {"hipblasChbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhbmv_v2"] = {"hipblasZhbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSsbmv_v2"] = {"hipblasSsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsbmv_v2"] = {"hpiblasDsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChbmv_v2"] = {"hipblasChbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhbmv_v2"] = {"hipblasZhbmv", CONV_MATH_FUNC, API_BLAS};
 
     // SPMV/HPMV
-    //cuda2hipRename["cublasSspmv_v2"] = {"hipblasSspmv", CONV_BLAS};
-    //cuda2hipRename["cublasDspmv_v2"] = {"hipblasDspmv", CONV_BLAS};
-    //cuda2hipRename["cublasChpmv_v2"] = {"hipblasChpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhpmv_v2"] = {"hipblasZhpmv", CONV_BLAS};
+    //cuda2hipRename["cublasSspmv_v2"] = {"hipblasSspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspmv_v2"] = {"hipblasDspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpmv_v2"] = {"hipblasChpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpmv_v2"] = {"hipblasZhpmv", CONV_MATH_FUNC, API_BLAS};
 
     // GER
-    cuda2hipRename["cublasSger_v2"]  = {"hipblasSger", CONV_BLAS};
-    //cuda2hipRename["cublasDger_v2"]  = {"hipblasDger", CONV_BLAS};
-    //cuda2hipRename["cublasCgeru_v2"] = {"hipblasCgeru", CONV_BLAS};
-    //cuda2hipRename["cublasCgerc_v2"] = {"hipblasCgerc", CONV_BLAS};
-    //cuda2hipRename["cublasZgeru_v2"] = {"hipblasZgeru", CONV_BLAS};
-    //cuda2hipRename["cublasZgerc_v2"] = {"hipblasZgerc", CONV_BLAS};
+    cuda2hipRename["cublasSger_v2"]  = {"hipblasSger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDger_v2"]  = {"hipblasDger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgeru_v2"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgerc_v2"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgeru_v2"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgerc_v2"] = {"hipblasZgerc", CONV_MATH_FUNC, API_BLAS};
 
     // SYR/HER
-    //cuda2hipRename["cublasSsyr_v2"] = {"hipblasSsyr", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr_v2"] = {"hipblasDsyr", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr_v2"] = {"hipblasCsyr", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr_v2"] = {"hipblasZsyr", CONV_BLAS};
-    //cuda2hipRename["cublasCher_v2"] = {"hipblasCher", CONV_BLAS};
-    //cuda2hipRename["cublasZher_v2"] = {"hipblasZher", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr_v2"] = {"hipblasSsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr_v2"] = {"hipblasDsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr_v2"] = {"hipblasCsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr_v2"] = {"hipblasZsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher_v2"] = {"hipblasCher", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher_v2"] = {"hipblasZher", CONV_MATH_FUNC, API_BLAS};
 
     // SPR/HPR
-    //cuda2hipRename["cublasSspr_v2"] = {"hipblasSspr", CONV_BLAS};
-    //cuda2hipRename["cublasDspr_v2"] = {"hipblasDspr", CONV_BLAS};
-    //cuda2hipRename["cublasChpr_v2"] = {"hipblasChpr", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr_v2"] = {"hipblasZhpr", CONV_BLAS};
+    //cuda2hipRename["cublasSspr_v2"] = {"hipblasSspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr_v2"] = {"hipblasDspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr_v2"] = {"hipblasChpr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr_v2"] = {"hipblasZhpr", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2/HER2
-    //cuda2hipRename["cublasSsyr2_v2"] = {"hipblasSsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2_v2"] = {"hipblasDsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr2_v2"] = {"hipblasCsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr2_v2"] = {"hipblasZsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasCher2_v2"] = {"hipblasCher2", CONV_BLAS};
-    //cuda2hipRename["cublasZher2_v2"] = {"hipblasZher2", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2_v2"] = {"hipblasSsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2_v2"] = {"hipblasDsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr2_v2"] = {"hipblasCsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr2_v2"] = {"hipblasZsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher2_v2"] = {"hipblasCher2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2_v2"] = {"hipblasZher2", CONV_MATH_FUNC, API_BLAS};
 
     // SPR2/HPR2
-    //cuda2hipRename["cublasSspr2_v2"] = {"hipblasSspr2", CONV_BLAS};
-    //cuda2hipRename["cublasDspr2_v2"] = {"hipblasDspr2", CONV_BLAS};
-    //cuda2hipRename["cublasChpr2_v2"] = {"hipblasChpr2", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr2_v2"] = {"hipblasZhpr2", CONV_BLAS};
+    //cuda2hipRename["cublasSspr2_v2"] = {"hipblasSspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr2_v2"] = {"hipblasDspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr2_v2"] = {"hipblasChpr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr2_v2"] = {"hipblasZhpr2", CONV_MATH_FUNC, API_BLAS};
 
     // Blas3 (v2) Routines
     // GEMM
-    cuda2hipRename["cublasSgemm_v2"] = {"hipblasSgemm", CONV_BLAS};
-    //cuda2hipRename["cublasDgemm_v2"] = {"hipblasDgemm", CONV_BLAS};
-    cuda2hipRename["cublasCgemm_v2"] = {"hipblasCgemm", CONV_BLAS};
-    //cuda2hipRename["cublasZgemm_v2"] = {"hipblasZgemm", CONV_BLAS};
+    cuda2hipRename["cublasSgemm_v2"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemm_v2"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasCgemm_v2"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemm_v2"] = {"hipblasZgemm", CONV_MATH_FUNC, API_BLAS};
 
     //IO in FP16 / FP32, computation in float
     // cublasSgemmEx
 
     // SYRK
-    //cuda2hipRename["cublasSsyrk_v2"] = {"hipblasSsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasDsyrk_v2"] = {"hipblasDsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasCsyrk_v2"] = {"hipblasCsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasZsyrk_v2"] = {"hipblasZsyrk", CONV_BLAS};
+    //cuda2hipRename["cublasSsyrk_v2"] = {"hipblasSsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyrk_v2"] = {"hipblasDsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyrk_v2"] = {"hipblasCsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyrk_v2"] = {"hipblasZsyrk", CONV_MATH_FUNC, API_BLAS};
 
     // HERK
-    //cuda2hipRename["cublasCherk_v2"] = {"hipblasCherk", CONV_BLAS};
-    //cuda2hipRename["cublasZherk_v2"] = {"hipblasZherk", CONV_BLAS};
+    //cuda2hipRename["cublasCherk_v2"] = {"hipblasCherk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZherk_v2"] = {"hipblasZherk", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2K
-    //cuda2hipRename["cublasSsyr2k_v2"] = {"hipblasSsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2k_v2"] = {"hipblasDsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr2k_v2"] = {"hipblasCsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr2k_v2"] = {"hipblasZsyr2k", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2k_v2"] = {"hipblasSsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2k_v2"] = {"hipblasDsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr2k_v2"] = {"hipblasCsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr2k_v2"] = {"hipblasZsyr2k", CONV_MATH_FUNC, API_BLAS};
 
     // HER2K
-    //cuda2hipRename["cublasCher2k_v2"] = {"hipblasCher2k", CONV_BLAS};
-    //cuda2hipRename["cublasZher2k_v2"] = {"hipblasZher2k", CONV_BLAS};
+    //cuda2hipRename["cublasCher2k_v2"] = {"hipblasCher2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2k_v2"] = {"hipblasZher2k", CONV_MATH_FUNC, API_BLAS};
 
     // SYMM
-    //cuda2hipRename["cublasSsymm_v2"] = {"hipblasSsymm", CONV_BLAS};
-    //cuda2hipRename["cublasDsymm_v2"] = {"hipblasDsymm", CONV_BLAS};
-    //cuda2hipRename["cublasCsymm_v2"] = {"hipblasCsymm", CONV_BLAS};
-    //cuda2hipRename["cublasZsymm_v2"] = {"hipblasZsymm", CONV_BLAS};
+    //cuda2hipRename["cublasSsymm_v2"] = {"hipblasSsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymm_v2"] = {"hipblasDsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymm_v2"] = {"hipblasCsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymm_v2"] = {"hipblasZsymm", CONV_MATH_FUNC, API_BLAS};
 
     // HEMM
-    //cuda2hipRename["cublasChemm_v2"] = {"hipblasChemm", CONV_BLAS};
-    //cuda2hipRename["cublasZhemm_v2"] = {"hipblasZhemm", CONV_BLAS};
+    //cuda2hipRename["cublasChemm_v2"] = {"hipblasChemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemm_v2"] = {"hipblasZhemm", CONV_MATH_FUNC, API_BLAS};
 
     // TRSM
-    //cuda2hipRename["cublasStrsm_v2"] = {"hipblasStrsm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsm_v2"] = {"hipblasDtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsm_v2"] = {"hipblasCtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsm_v2"] = {"hipblasZtrsm", CONV_BLAS};
+    //cuda2hipRename["cublasStrsm_v2"] = {"hipblasStrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsm_v2"] = {"hipblasDtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsm_v2"] = {"hipblasCtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsm_v2"] = {"hipblasZtrsm", CONV_MATH_FUNC, API_BLAS};
 
     // TRMM
-    //cuda2hipRename["cublasStrmm_v2"] = {"hipblasStrmm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmm_v2"] = {"hipblasDtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmm_v2"] = {"hipblasCtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmm_v2"] = {"hipblasZtrmm", CONV_BLAS};
+    //cuda2hipRename["cublasStrmm_v2"] = {"hipblasStrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmm_v2"] = {"hipblasDtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmm_v2"] = {"hipblasCtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmm_v2"] = {"hipblasZtrmm", CONV_MATH_FUNC, API_BLAS};
 
     // NRM2
-    //cuda2hipRename["cublasSnrm2_v2"]  = {"hipblasSnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDnrm2_v2"]  = {"hipblasDnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasScnrm2_v2"] = {"hipblasScnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDznrm2_v2"] = {"hipblasDznrm2", CONV_BLAS};
+    //cuda2hipRename["cublasSnrm2_v2"]  = {"hipblasSnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDnrm2_v2"]  = {"hipblasDnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScnrm2_v2"] = {"hipblasScnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDznrm2_v2"] = {"hipblasDznrm2", CONV_MATH_FUNC, API_BLAS};
 
     // DOT
-    cuda2hipRename["cublasSdot_v2"]  = {"hipblasSdot", CONV_BLAS};
-    cuda2hipRename["cublasDdot_v2"]  = {"hipblasDdot", CONV_BLAS};
-    //cuda2hipRename["cublasCdotu_v2"] = {"hipblasCdotu", CONV_BLAS};
-    //cuda2hipRename["cublasCdotc_v2"] = {"hipblasCdotc", CONV_BLAS};
-    //cuda2hipRename["cublasZdotu_v2"] = {"hipblasZdotu", CONV_BLAS};
-    //cuda2hipRename["cublasZdotc_v2"] = {"hipblasZdotc", CONV_BLAS};
+    cuda2hipRename["cublasSdot_v2"]  = {"hipblasSdot", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDdot_v2"]  = {"hipblasDdot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotu_v2"] = {"hipblasCdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotc_v2"] = {"hipblasCdotc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotu_v2"] = {"hipblasZdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotc_v2"] = {"hipblasZdotc", CONV_MATH_FUNC, API_BLAS};
 
     // SCAL
-    cuda2hipRename["cublasSscal_v2"]  = {"hipblasSscal", CONV_BLAS};
-    cuda2hipRename["cublasDscal_v2"]  = {"hipblasDscal", CONV_BLAS};
-    //cuda2hipRename["cublasCscal_v2"]  = {"hipblasCscal", CONV_BLAS};
-    //cuda2hipRename["cublasCsscal_v2"] = {"hipblasCsscal", CONV_BLAS};
-    //cuda2hipRename["cublasZscal_v2"]  = {"hipblasZscal", CONV_BLAS};
-    //cuda2hipRename["cublasZdscal_v2"] = {"hipblasZdscal", CONV_BLAS};
+    cuda2hipRename["cublasSscal_v2"]  = {"hipblasSscal", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDscal_v2"]  = {"hipblasDscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCscal_v2"]  = {"hipblasCscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsscal_v2"] = {"hipblasCsscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZscal_v2"]  = {"hipblasZscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdscal_v2"] = {"hipblasZdscal", CONV_MATH_FUNC, API_BLAS};
 
     // AXPY
-    cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_BLAS};
+    cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS};
 
     // COPY
-    cuda2hipRename["cublasScopy_v2"] = {"hipblasScopy", CONV_BLAS};
-    cuda2hipRename["cublasDcopy_v2"] = {"hipblasDcopy", CONV_BLAS};
-    //cuda2hipRename["cublasCcopy_v2"] = {"hipblasCcopy", CONV_BLAS};
-    //cuda2hipRename["cublasZcopy_v2"] = {"hipblasZcopy", CONV_BLAS};
+    cuda2hipRename["cublasScopy_v2"] = {"hipblasScopy", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDcopy_v2"] = {"hipblasDcopy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCcopy_v2"] = {"hipblasCcopy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZcopy_v2"] = {"hipblasZcopy", CONV_MATH_FUNC, API_BLAS};
 
     // SWAP
-    //cuda2hipRename["cublasSswap_v2"] = {"hipblasSswap", CONV_BLAS};
-    //cuda2hipRename["cublasDswap_v2"] = {"hipblasDswap", CONV_BLAS};
-    //cuda2hipRename["cublasCswap_v2"] = {"hipblasCswap", CONV_BLAS};
-    //cuda2hipRename["cublasZswap_v2"] = {"hipblasZswap", CONV_BLAS};
+    //cuda2hipRename["cublasSswap_v2"] = {"hipblasSswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDswap_v2"] = {"hipblasDswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCswap_v2"] = {"hipblasCswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZswap_v2"] = {"hipblasZswap", CONV_MATH_FUNC, API_BLAS};
 
     // AMAX
-    //cuda2hipRename["cublasIsamax_v2"] = {"hipblasIsamax", CONV_BLAS};
-    //cuda2hipRename["cublasIdamax_v2"] = {"hipblasIdamax", CONV_BLAS};
-    //cuda2hipRename["cublasIcamax_v2"] = {"hipblasIcamax", CONV_BLAS};
-    //cuda2hipRename["cublasIzamax_v2"] = {"hipblasIzamax", CONV_BLAS};
+    //cuda2hipRename["cublasIsamax_v2"] = {"hipblasIsamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamax_v2"] = {"hipblasIdamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamax_v2"] = {"hipblasIcamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamax_v2"] = {"hipblasIzamax", CONV_MATH_FUNC, API_BLAS};
 
     // AMIN
-    //cuda2hipRename["cublasIsamin_v2"] = {"hipblasIsamin", CONV_BLAS};
-    //cuda2hipRename["cublasIdamin_v2"] = {"hipblasIdamin", CONV_BLAS};
-    //cuda2hipRename["cublasIcamin_v2"] = {"hipblasIcamin", CONV_BLAS};
-    //cuda2hipRename["cublasIzamin_v2"] = {"hipblasIzamin", CONV_BLAS};
+    //cuda2hipRename["cublasIsamin_v2"] = {"hipblasIsamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamin_v2"] = {"hipblasIdamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamin_v2"] = {"hipblasIcamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamin_v2"] = {"hipblasIzamin", CONV_MATH_FUNC, API_BLAS};
 
     // ASUM
-    cuda2hipRename["cublasSasum_v2"]  = {"hipblasSasum", CONV_BLAS};
-    cuda2hipRename["cublasDasum_v2"]  = {"hipblasDasum", CONV_BLAS};
-    //cuda2hipRename["cublasScasum_v2"] = {"hipblasScasum", CONV_BLAS};
-    //cuda2hipRename["cublasDzasum_v2"] = {"hipblasDzasum", CONV_BLAS};
+    cuda2hipRename["cublasSasum_v2"]  = {"hipblasSasum", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDasum_v2"]  = {"hipblasDasum", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScasum_v2"] = {"hipblasScasum", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDzasum_v2"] = {"hipblasDzasum", CONV_MATH_FUNC, API_BLAS};
 
     // ROT
-    //cuda2hipRename["cublasSrot_v2"]  = {"hipblasSrot", CONV_BLAS};
-    //cuda2hipRename["cublasDrot_v2"]  = {"hipblasDrot", CONV_BLAS};
-    //cuda2hipRename["cublasCrot_v2"]  = {"hipblasCrot", CONV_BLAS};
-    //cuda2hipRename["cublasCsrot_v2"] = {"hipblasCsrot", CONV_BLAS};
-    //cuda2hipRename["cublasZrot_v2"]  = {"hipblasZrot", CONV_BLAS};
-    //cuda2hipRename["cublasZdrot_v2"] = {"hipblasZdrot", CONV_BLAS};
+    //cuda2hipRename["cublasSrot_v2"]  = {"hipblasSrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrot_v2"]  = {"hipblasDrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrot_v2"]  = {"hipblasCrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsrot_v2"] = {"hipblasCsrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrot_v2"]  = {"hipblasZrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdrot_v2"] = {"hipblasZdrot", CONV_MATH_FUNC, API_BLAS};
 
     // ROTG
-    //cuda2hipRename["cublasSrotg_v2"] = {"hipblasSrotg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotg_v2"] = {"hipblasDrotg", CONV_BLAS};
-    //cuda2hipRename["cublasCrotg_v2"] = {"hipblasCrotg", CONV_BLAS};
-    //cuda2hipRename["cublasZrotg_v2"] = {"hipblasZrotg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotg_v2"] = {"hipblasSrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotg_v2"] = {"hipblasDrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrotg_v2"] = {"hipblasCrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrotg_v2"] = {"hipblasZrotg", CONV_MATH_FUNC, API_BLAS};
 
     // ROTM
-    //cuda2hipRename["cublasSrotm_v2"] = {"hipblasSrotm", CONV_BLAS};
-    //cuda2hipRename["cublasDrotm_v2"] = {"hipblasDrotm", CONV_BLAS};
+    //cuda2hipRename["cublasSrotm_v2"] = {"hipblasSrotm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotm_v2"] = {"hipblasDrotm", CONV_MATH_FUNC, API_BLAS};
 
     // ROTMG
-    //cuda2hipRename["cublasSrotmg_v2"] = {"hipblasSrotmg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotmg_v2"] = {"hipblasDrotmg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotmg_v2"] = {"hipblasSrotmg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotmg_v2"] = {"hipblasDrotmg", CONV_MATH_FUNC, API_BLAS};
   }
 
   struct HipNames {
     StringRef hipName;
     ConvTypes countType;
+    ApiTypes countApiType;
   };
 
   SmallDenseMap<StringRef, HipNames> cuda2hipRename;
@@ -1023,7 +1035,8 @@ StringRef unquoteStr(StringRef s) {
 static void processString(StringRef s, const cuda2hipMap &map,
                           Replacements *Replace, SourceManager &SM,
                           SourceLocation start,
-                          int64_t countReps[ConvTypes::CONV_LAST]) {
+                          int64_t countReps[CONV_LAST],
+                          int64_t countApiReps[API_LAST]) {
   size_t begin = 0;
   while ((begin = s.find("cuda", begin)) != StringRef::npos ||
          (begin = s.find("cublas", begin)) != StringRef::npos) {
@@ -1031,8 +1044,9 @@ static void processString(StringRef s, const cuda2hipMap &map,
     StringRef name = s.slice(begin, end);
     const auto found = map.cuda2hipRename.find(name);
     if (found != map.cuda2hipRename.end()) {
-      countReps[CONV_LITERAL]++;
       StringRef repName = found->second.hipName;
+      countReps[CONV_LITERAL]++;
+      countApiReps[API_RUNTIME]++;
       SourceLocation sl = start.getLocWithOffset(begin + 1);
       Replacement Rep(SM, sl, name.size(), repName);
       Replace->insert(Rep);
@@ -1074,8 +1088,9 @@ public:
       if (is_angled) {
         const auto found = N.cuda2hipRename.find(file_name);
         if (found != N.cuda2hipRename.end()) {
-          countReps[found->second.countType]++;
           StringRef repName = found->second.hipName;
+          countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
           DEBUG(dbgs() << "Include file found: " << file_name << "\n"
                        << "SourceLocation:"
                        << filename_range.getBegin().printToString(*_sm) << "\n"
@@ -1102,8 +1117,9 @@ public:
           StringRef name = T.getIdentifierInfo()->getName();
           const auto found = N.cuda2hipRename.find(name);
           if (found != N.cuda2hipRename.end()) {
-            countReps[found->second.countType]++;
             StringRef repName = found->second.hipName;
+            countReps[found->second.countType]++;
+            countApiReps[found->second.countApiType]++;
             SourceLocation sl = T.getLocation();
             DEBUG(dbgs() << "Identifier " << name
                          << " found in definition of macro "
@@ -1148,8 +1164,9 @@ public:
               StringRef name = tok.getIdentifierInfo()->getName();
               const auto found = N.cuda2hipRename.find(name);
               if (found != N.cuda2hipRename.end()) {
-                countReps[found->second.countType]++;
                 StringRef repName = found->second.hipName;
+                countReps[found->second.countType]++;
+                countApiReps[found->second.countApiType]++;
                 DEBUG(dbgs()
                       << "Identifier " << name
                       << " found as an actual argument in expansion of macro "
@@ -1170,8 +1187,9 @@ public:
                 const auto found = N.cuda2hipRename.find(name);
                 if (found != N.cuda2hipRename.end()) {
                   sl = sl_macro;
-                  countReps[found->second.countType]++;
                   StringRef repName = found->second.hipName;
+                  countReps[found->second.countType]++;
+                  countApiReps[found->second.countApiType]++;
                   Replacement Rep(*_sm, sl, length, repName);
                   Replace->insert(Rep);
                 }
@@ -1179,7 +1197,7 @@ public:
                 if (tok.is(tok::string_literal)) {
                   StringRef s(tok.getLiteralData(), tok.getLength());
                   processString(unquoteStr(s), N, Replace, *_sm, tok.getLocation(),
-                    countReps);
+                    countReps, countApiReps);
                 }
               }
             }
@@ -1195,7 +1213,8 @@ public:
   void setSourceManager(SourceManager *sm) { _sm = sm; }
   void setPreprocessor(Preprocessor *pp) { _pp = pp; }
   void setMatch(Cuda2HipCallback *match) { Match = match; }
-  int64_t countReps[ConvTypes::CONV_LAST] = {0};
+  int64_t countReps[CONV_LAST]  = { 0 };
+  int64_t countApiReps[API_LAST] = { 0 };
 
 private:
   SourceManager *_sm;
@@ -1206,65 +1225,49 @@ private:
 };
 
 class Cuda2HipCallback : public MatchFinder::MatchCallback {
-public:
-  Cuda2HipCallback(Replacements *Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks)
-    : Replace(Replace), owner(parent), PP(PPCallbacks) {
-    PP->setMatch(this);
-  }
-
-  void convertKernelDecl(const FunctionDecl *kernelDecl,
-                         const MatchFinder::MatchResult &Result) {
+private:
+  void convertKernelDecl(const FunctionDecl *kernelDecl, const MatchFinder::MatchResult &Result) {
     SourceManager *SM = Result.SourceManager;
     LangOptions DefaultLangOptions;
-
     SmallString<40> XStr;
     raw_svector_ostream OS(XStr);
     StringRef initialParamList;
     OS << "hipLaunchParm lp";
-    size_t replacementLength = OS.str().size();
+    size_t repLength = OS.str().size();
     SourceLocation sl = kernelDecl->getNameInfo().getEndLoc();
     SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(
-        sl, tok::l_paren, *SM, DefaultLangOptions, true);
+      sl, tok::l_paren, *SM, DefaultLangOptions, true);
     DEBUG(dbgs() << kernelArgListStart.printToString(*SM));
     if (kernelDecl->getNumParams() > 0) {
       const ParmVarDecl *pvdFirst = kernelDecl->getParamDecl(0);
-      const ParmVarDecl *pvdLast =
-          kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1);
+      const ParmVarDecl *pvdLast =  kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1);
       SourceLocation kernelArgListStart(pvdFirst->getLocStart());
       SourceLocation kernelArgListEnd(pvdLast->getLocEnd());
-      SourceLocation stop = Lexer::getLocForEndOfToken(
-          kernelArgListEnd, 0, *SM, DefaultLangOptions);
-      replacementLength +=
-          SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
-      initialParamList = StringRef(SM->getCharacterData(kernelArgListStart),
-                                   replacementLength);
+      SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions);
+      repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
+      initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength);
       OS << ", " << initialParamList;
     }
-    DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n"
-                 << "new paramlist: " << OS.str() << "\n");
-    Replacement Rep0(*(Result.SourceManager), kernelArgListStart,
-                     replacementLength, OS.str());
+    DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n");
+    Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
     Replace->insert(Rep0);
   }
 
-  void run(const MatchFinder::MatchResult &Result) override {
-    SourceManager *SM = Result.SourceManager;
-    LangOptions DefaultLangOptions;
-
-    if (const CallExpr *call =
-            Result.Nodes.getNodeAs<CallExpr>("cudaCall")) {
+  bool cudaCall(const MatchFinder::MatchResult &Result) {
+    if (const CallExpr *call = Result.Nodes.getNodeAs<CallExpr>("cudaCall")) {
       const FunctionDecl *funcDcl = call->getDirectCallee();
       StringRef name = funcDcl->getDeclName().getAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
+        SourceManager *SM = Result.SourceManager;
         StringRef repName = found->second.hipName;
         SourceLocation sl = call->getLocStart();
         size_t length = name.size();
         bool bReplace = true;
         if (SM->isMacroArgExpansion(sl)) {
           sl = SM->getImmediateSpellingLoc(sl);
-        }
-        else if (SM->isMacroBodyExpansion(sl)) {
+        } else if (SM->isMacroBodyExpansion(sl)) {
+          LangOptions DefaultLangOptions;
           SourceLocation sl_macro = SM->getExpansionLoc(sl);
           SourceLocation sl_end = Lexer::getLocForEndOfToken(sl_macro, 0, *SM, DefaultLangOptions);
           length = SM->getCharacterData(sl_end) - SM->getCharacterData(sl_macro);
@@ -1277,14 +1280,18 @@ public:
         }
         if (bReplace) {
           countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
           Replacement Rep(*SM, sl, length, repName);
           Replace->insert(Rep);
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const CUDAKernelCallExpr *launchKernel =
-            Result.Nodes.getNodeAs<CUDAKernelCallExpr>("cudaLaunchKernel")) {
+  bool cudaLaunchKernel(const MatchFinder::MatchResult &Result) {
+    if (const CUDAKernelCallExpr *launchKernel = Result.Nodes.getNodeAs<CUDAKernelCallExpr>("cudaLaunchKernel")) {
       SmallString<40> XStr;
       raw_svector_ostream OS(XStr);
       StringRef calleeName;
@@ -1295,78 +1302,66 @@ public:
       } else {
         const Expr *e = launchKernel->getCallee();
         if (const UnresolvedLookupExpr *ule =
-                dyn_cast<UnresolvedLookupExpr>(e)) {
+          dyn_cast<UnresolvedLookupExpr>(e)) {
           calleeName = ule->getName().getAsIdentifierInfo()->getName();
           owner->addMatcher(functionTemplateDecl(hasName(calleeName))
-                                .bind("unresolvedTemplateName"),
-                            this);
+            .bind("unresolvedTemplateName"),
+            this);
         }
       }
-
       XStr.clear();
       OS << "hipLaunchKernel(HIP_KERNEL_NAME(" << calleeName << "),";
-
       const CallExpr *config = launchKernel->getConfig();
-      DEBUG(dbgs() << "Kernel config arguments:"
-                   << "\n");
+      DEBUG(dbgs() << "Kernel config arguments:" << "\n");
+      SourceManager *SM = Result.SourceManager;
+      LangOptions DefaultLangOptions;
       for (unsigned argno = 0; argno < config->getNumArgs(); argno++) {
         const Expr *arg = config->getArg(argno);
         if (!isa<CXXDefaultArgExpr>(arg)) {
-          const ParmVarDecl *pvd =
-              config->getDirectCallee()->getParamDecl(argno);
-
+          const ParmVarDecl *pvd = config->getDirectCallee()->getParamDecl(argno);
           SourceLocation sl(arg->getLocStart());
           SourceLocation el(arg->getLocEnd());
-          SourceLocation stop =
-              Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
-          StringRef outs(SM->getCharacterData(sl),
-                         SM->getCharacterData(stop) - SM->getCharacterData(sl));
-          DEBUG(dbgs() << "args[ " << argno << "]" << outs << " <"
-                       << pvd->getType().getAsString() << ">"
-                       << "\n");
-          if (pvd->getType().getAsString().compare("dim3") == 0)
+          SourceLocation stop = Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
+          StringRef outs(SM->getCharacterData(sl), SM->getCharacterData(stop) - SM->getCharacterData(sl));
+          DEBUG(dbgs() << "args[ " << argno << "]" << outs << " <" << pvd->getType().getAsString() << ">\n");
+          if (pvd->getType().getAsString().compare("dim3") == 0) {
             OS << " dim3(" << outs << "),";
-          else
+          } else {
             OS << " " << outs << ",";
-        } else
+          }
+        } else {
           OS << " 0,";
+        }
       }
-
       for (unsigned argno = 0; argno < launchKernel->getNumArgs(); argno++) {
         const Expr *arg = launchKernel->getArg(argno);
         SourceLocation sl(arg->getLocStart());
         SourceLocation el(arg->getLocEnd());
-        SourceLocation stop =
-            Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
-        std::string outs(SM->getCharacterData(sl),
-                         SM->getCharacterData(stop) - SM->getCharacterData(sl));
+        SourceLocation stop = Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
+        std::string outs(SM->getCharacterData(sl), SM->getCharacterData(stop) - SM->getCharacterData(sl));
         DEBUG(dbgs() << outs << "\n");
         OS << " " << outs << ",";
       }
       XStr.pop_back();
       OS << ")";
-      size_t length =
-          SM->getCharacterData(Lexer::getLocForEndOfToken(
-              launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) -
-          SM->getCharacterData(launchKernel->getLocStart());
+      size_t length = SM->getCharacterData(Lexer::getLocForEndOfToken(
+                        launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) -
+                        SM->getCharacterData(launchKernel->getLocStart());
       Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str());
       Replace->insert(Rep);
-      countReps[ConvTypes::CONV_KERN]++;
+      countReps[CONV_KERN]++;
+      countApiReps[API_RUNTIME]++;
+      return true;
     }
+    return false;
+  }
 
-    if (const FunctionTemplateDecl *templateDecl =
-            Result.Nodes.getNodeAs<FunctionTemplateDecl>(
-                "unresolvedTemplateName")) {
-      FunctionDecl *kernelDecl = templateDecl->getTemplatedDecl();
-      convertKernelDecl(kernelDecl, Result);
-    }
-
-    if (const MemberExpr *threadIdx =
-            Result.Nodes.getNodeAs<MemberExpr>("cudaBuiltin")) {
+  bool cudaBuiltin(const MatchFinder::MatchResult &Result) {
+    if (const MemberExpr *threadIdx = Result.Nodes.getNodeAs<MemberExpr>("cudaBuiltin")) {
       if (const OpaqueValueExpr *refBase =
-              dyn_cast<OpaqueValueExpr>(threadIdx->getBase())) {
+        dyn_cast<OpaqueValueExpr>(threadIdx->getBase())) {
         if (const DeclRefExpr *declRef =
-                dyn_cast<DeclRefExpr>(refBase->getSourceExpr())) {
+          dyn_cast<DeclRefExpr>(refBase->getSourceExpr())) {
           StringRef name = declRef->getDecl()->getName();
           StringRef memberName = threadIdx->getMemberDecl()->getName();
           size_t pos = memberName.find_first_not_of("__fetch_builtin_");
@@ -1375,51 +1370,66 @@ public:
           name = Twine(name + "." + memberName).toStringRef(tmpData);
           const auto found = N.cuda2hipRename.find(name);
           if (found != N.cuda2hipRename.end()) {
-            countReps[found->second.countType]++;
             StringRef repName = found->second.hipName;
+            countReps[found->second.countType]++;
+            countApiReps[found->second.countApiType]++;
             SourceLocation sl = threadIdx->getLocStart();
+            SourceManager *SM = Result.SourceManager;
             Replacement Rep(*SM, sl, name.size(), repName);
             Replace->insert(Rep);
           }
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const DeclRefExpr *cudaEnumConstantRef =
-            Result.Nodes.getNodeAs<DeclRefExpr>("cudaEnumConstantRef")) {
-      StringRef name = cudaEnumConstantRef->getDecl()->getNameAsString();
+  bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) {
+    if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs<DeclRefExpr>("cudaEnumConstantRef")) {
+      StringRef name = enumConstantRef->getDecl()->getNameAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        SourceLocation sl = cudaEnumConstantRef->getLocStart();
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
+        SourceLocation sl = enumConstantRef->getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaEnumConstantDecl =
-            Result.Nodes.getNodeAs<VarDecl>("cudaEnumConstantDecl")) {
+  bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs<VarDecl>("cudaEnumConstantDecl")) {
       StringRef name =
-          cudaEnumConstantDecl->getType()->getAsTagDecl()->getNameAsString();
+        enumConstantDecl->getType()->getAsTagDecl()->getNameAsString();
       // anonymous typedef enum
       if (name.empty()) {
-        QualType QT = cudaEnumConstantDecl->getType().getUnqualifiedType();
+        QualType QT = enumConstantDecl->getType().getUnqualifiedType();
         name = QT.getAsString();
       }
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        SourceLocation sl = cudaEnumConstantDecl->getLocStart();
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
+        SourceLocation sl = enumConstantDecl->getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaTypedefVar =
-      Result.Nodes.getNodeAs<VarDecl>("cudaTypedefVar")) {
-      QualType QT = cudaTypedefVar->getType();
+  bool cudaTypedefVar(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *typedefVar = Result.Nodes.getNodeAs<VarDecl>("cudaTypedefVar")) {
+      QualType QT = typedefVar->getType();
       if (QT->isArrayType()) {
         QT = QT.getTypePtr()->getAsArrayTypeUnsafe()->getElementType();
       }
@@ -1427,33 +1437,87 @@ public:
       StringRef name = QT.getAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        SourceLocation sl = cudaTypedefVar->getLocStart();
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
+        SourceLocation sl = typedefVar->getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaStructVar =
-            Result.Nodes.getNodeAs<VarDecl>("cudaStructVar")) {
-      StringRef name = cudaStructVar->getType()
-                           ->getAsStructureType()
-                           ->getDecl()
-                           ->getNameAsString();
+  bool cudaStructVar(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *structVar = Result.Nodes.getNodeAs<VarDecl>("cudaStructVar")) {
+      StringRef name = structVar->getType()
+        ->getAsStructureType()
+        ->getDecl()
+        ->getNameAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        TypeLoc TL = cudaStructVar->getTypeSourceInfo()->getTypeLoc();
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
+        TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *sharedVar =
-      Result.Nodes.getNodeAs<VarDecl>("cudaSharedIncompleteArrayVar")) {
+  bool cudaStructVarPtr(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *structVarPtr = Result.Nodes.getNodeAs<VarDecl>("cudaStructVarPtr")) {
+      const Type *t = structVarPtr->getType().getTypePtrOrNull();
+      if (t) {
+        StringRef name = t->getPointeeCXXRecordDecl()->getName();
+        const auto found = N.cuda2hipRename.find(name);
+        if (found != N.cuda2hipRename.end()) {
+          StringRef repName = found->second.hipName;
+          countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
+          TypeLoc TL = structVarPtr->getTypeSourceInfo()->getTypeLoc();
+          SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+          SourceManager *SM = Result.SourceManager;
+          Replacement Rep(*SM, sl, name.size(), repName);
+          Replace->insert(Rep);
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+
+  bool cudaStructSizeOf(const MatchFinder::MatchResult &Result) {
+    if (const UnaryExprOrTypeTraitExpr *expr = Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>("cudaStructSizeOf")) {
+      TypeSourceInfo *typeInfo = expr->getArgumentTypeInfo();
+      QualType QT = typeInfo->getType().getUnqualifiedType();
+      const Type *type = QT.getTypePtr();
+      StringRef name = type->getAsCXXRecordDecl()->getName();
+      const auto found = N.cuda2hipRename.find(name);
+      if (found != N.cuda2hipRename.end()) {
+        StringRef repName = found->second.hipName;
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
+        TypeLoc TL = typeInfo->getTypeLoc();
+        SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+        SourceManager *SM = Result.SourceManager;
+        Replacement Rep(*SM, sl, name.size(), repName);
+        Replace->insert(Rep);
+      }
+      return true;
+    }
+    return false;
+  }
+
+  bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *sharedVar = Result.Nodes.getNodeAs<VarDecl>("cudaSharedIncompleteArrayVar")) {
       // Example: extern __shared__ uint sRadix1[];
       if (sharedVar->hasExternalFormalLinkage()) {
         QualType QT = sharedVar->getType();
@@ -1477,6 +1541,7 @@ public:
         if (!typeName.empty()) {
           SourceLocation slStart = sharedVar->getLocStart();
           SourceLocation slEnd = sharedVar->getLocEnd();
+          SourceManager *SM = Result.SourceManager;
           size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
           SmallString<128> tmpData;
           StringRef varName = sharedVar->getNameAsString();
@@ -1484,30 +1549,17 @@ public:
           Replacement Rep(*SM, slStart, repLength, repName);
           Replace->insert(Rep);
           countReps[CONV_MEM]++;
+          countApiReps[API_RUNTIME]++;
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaStructVarPtr =
-            Result.Nodes.getNodeAs<VarDecl>("cudaStructVarPtr")) {
-      const Type *t = cudaStructVarPtr->getType().getTypePtrOrNull();
-      if (t) {
-        StringRef name = t->getPointeeCXXRecordDecl()->getName();
-        const auto found = N.cuda2hipRename.find(name);
-        if (found != N.cuda2hipRename.end()) {
-          countReps[found->second.countType]++;
-          StringRef repName = found->second.hipName;
-          TypeLoc TL = cudaStructVarPtr->getTypeSourceInfo()->getTypeLoc();
-          SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
-          Replacement Rep(*SM, sl, name.size(), repName);
-          Replace->insert(Rep);
-        }
-      }
-    }
-
-    if (const ParmVarDecl *cudaParamDecl =
-            Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDecl")) {
-      QualType QT = cudaParamDecl->getOriginalType().getUnqualifiedType();
+  bool cudaParamDecl(const MatchFinder::MatchResult &Result) {
+    if (const ParmVarDecl *paramDecl = Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDecl")) {
+      QualType QT = paramDecl->getOriginalType().getUnqualifiedType();
       StringRef name = QT.getAsString();
       const Type *t = QT.getTypePtr();
       if (t->isStructureOrClassType()) {
@@ -1515,73 +1567,104 @@ public:
       }
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        TypeLoc TL = cudaParamDecl->getTypeSourceInfo()->getTypeLoc();
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
+        TypeLoc TL = paramDecl->getTypeSourceInfo()->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const ParmVarDecl *cudaParamDeclPtr =
-            Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDeclPtr")) {
-      const Type *pt = cudaParamDeclPtr->getType().getTypePtrOrNull();
+  bool cudaParamDeclPtr(const MatchFinder::MatchResult &Result) {
+    if (const ParmVarDecl *paramDeclPtr = Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDeclPtr")) {
+      const Type *pt = paramDeclPtr->getType().getTypePtrOrNull();
       if (pt) {
         QualType QT = pt->getPointeeType();
         const Type *t = QT.getTypePtr();
         StringRef name = t->isStructureOrClassType()
-                             ? t->getAsCXXRecordDecl()->getName()
-                             : StringRef(QT.getAsString());
+          ? t->getAsCXXRecordDecl()->getName()
+          : StringRef(QT.getAsString());
         const auto found = N.cuda2hipRename.find(name);
         if (found != N.cuda2hipRename.end()) {
-          countReps[found->second.countType]++;
           StringRef repName = found->second.hipName;
-          TypeLoc TL = cudaParamDeclPtr->getTypeSourceInfo()->getTypeLoc();
+          countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
+          TypeLoc TL = paramDeclPtr->getTypeSourceInfo()->getTypeLoc();
           SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+          SourceManager *SM = Result.SourceManager;
           Replacement Rep(*SM, sl, name.size(), repName);
           Replace->insert(Rep);
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const StringLiteral *stringLiteral =
-            Result.Nodes.getNodeAs<StringLiteral>("stringLiteral")) {
-      if (stringLiteral->getCharByteWidth() == 1) {
-        StringRef s = stringLiteral->getString();
-        processString(s, N, Replace, *SM, stringLiteral->getLocStart(),
-                      countReps);
+  bool unresolvedTemplateName(const MatchFinder::MatchResult &Result) {
+    if (const FunctionTemplateDecl *templateDecl = Result.Nodes.getNodeAs<FunctionTemplateDecl>("unresolvedTemplateName")) {
+      FunctionDecl *kernelDecl = templateDecl->getTemplatedDecl();
+      convertKernelDecl(kernelDecl, Result);
+      return true;
+    }
+    return false;
+  }
+
+  bool stringLiteral(const MatchFinder::MatchResult &Result) {
+    if (const StringLiteral *sLiteral = Result.Nodes.getNodeAs<StringLiteral>("stringLiteral")) {
+      if (sLiteral->getCharByteWidth() == 1) {
+        StringRef s = sLiteral->getString();
+        SourceManager *SM = Result.SourceManager;
+        processString(s, N, Replace, *SM, sLiteral->getLocStart(), countReps, countApiReps);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const UnaryExprOrTypeTraitExpr *expr =
-            Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>(
-                "cudaStructSizeOf")) {
-      TypeSourceInfo *typeInfo = expr->getArgumentTypeInfo();
-      QualType QT = typeInfo->getType().getUnqualifiedType();
-      const Type *type = QT.getTypePtr();
-      StringRef name = type->getAsCXXRecordDecl()->getName();
-      const auto found = N.cuda2hipRename.find(name);
-      if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
-        StringRef repName = found->second.hipName;
-        TypeLoc TL = typeInfo->getTypeLoc();
-        SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
-        Replacement Rep(*SM, sl, name.size(), repName);
-        Replace->insert(Rep);
-      }
-    }
+public:
+  Cuda2HipCallback(Replacements *Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks)
+    : Replace(Replace), owner(parent), PP(PPCallbacks) {
+    PP->setMatch(this);
+  }
 
+  void run(const MatchFinder::MatchResult &Result) override {
+    do {
+      if (cudaCall(Result)) break;
+      if (cudaLaunchKernel(Result)) break;
+      if (cudaBuiltin(Result)) break;
+      if (cudaEnumConstantRef(Result)) break;
+      if (cudaEnumConstantDecl(Result)) break;
+      if (cudaTypedefVar(Result)) break;
+      if (cudaStructVar(Result)) break;
+      if (cudaStructVarPtr(Result)) break;
+      if (cudaStructSizeOf(Result)) break;
+      if (cudaSharedIncompleteArrayVar(Result)) break;
+      if (cudaParamDecl(Result)) break;
+      if (cudaParamDeclPtr(Result)) break;
+      if (stringLiteral(Result)) break;
+      if (unresolvedTemplateName(Result)) break;
+      break;
+    } while (false);
     if (PP->countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 &&
             countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) {
       StringRef repName = "#include <hip_runtime.h>\n";
+      SourceManager *SM = Result.SourceManager;
       Replacement Rep(*SM, SM->getLocForStartOfFile(SM->getMainFileID()), 0, repName);
       Replace->insert(Rep);
       countReps[CONV_INCLUDE_CUDA_MAIN_H]++;
+      countApiReps[API_RUNTIME]++;
     }
   }
 
-  int64_t countReps[ConvTypes::CONV_LAST] = {0};
+  int64_t countReps[CONV_LAST]  = { 0 };
+  int64_t countApiReps[API_LAST] = { 0 };
 
 private:
   Replacements *Replace;
@@ -1592,11 +1675,12 @@ private:
 
 void HipifyPPCallbacks::handleEndSource() {
   if (Match->countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 &&
-    countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) {
+             countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) {
     StringRef repName = "#include <hip_runtime.h>\n";
     Replacement Rep(*_sm, _sm->getLocForStartOfFile(_sm->getMainFileID()), 0, repName);
     Replace->insert(Rep);
     countReps[CONV_INCLUDE_CUDA_MAIN_H]++;
+    countApiReps[API_RUNTIME]++;
   }
 }
 
@@ -1621,18 +1705,90 @@ static cl::opt<bool>
 
 static cl::opt<bool>
     PrintStats("print-stats", cl::desc("print the command-line, like a header"),
-               cl::value_desc("print-stats"));
+    cl::value_desc("print-stats"));
+
+void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callback) {
+  Finder.addMatcher(callExpr(isExpansionInMainFile(),
+                             callee(functionDecl(matchesName("cuda.*|cublas.*"))))
+                             .bind("cudaCall"),
+                             Callback);
+  Finder.addMatcher(cudaKernelCallExpr().bind("cudaLaunchKernel"), Callback);
+  Finder.addMatcher(memberExpr(isExpansionInMainFile(),
+                               hasObjectExpression(hasType(cxxRecordDecl(
+                               matchesName("__cuda_builtin_")))))
+                               .bind("cudaBuiltin"),
+                               Callback);
+  Finder.addMatcher(declRefExpr(isExpansionInMainFile(),
+                                to(enumConstantDecl(
+                                matchesName("cuda.*|cublas.*|CUDA.*|CUBLAS*"))))
+                                .bind("cudaEnumConstantRef"),
+                                Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(enumDecl()))
+                            .bind("cudaEnumConstantDecl"),
+                            Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(typedefDecl(matchesName("cuda.*|cublas.*"))))
+                            .bind("cudaTypedefVar"),
+                            Callback);
+  // Array of elements of typedef type, Example: cudaStream_t streams[2];
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(arrayType(hasElementType(typedefType(
+                            hasDeclaration(typedefDecl(matchesName("cuda.*|cublas.*"))))))))
+                            .bind("cudaTypedefVar"),
+                            Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(cxxRecordDecl(matchesName("cuda.*|cublas.*"))))
+                            .bind("cudaStructVar"),
+                            Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(pointsTo(cxxRecordDecl(
+                                             matchesName("cuda.*|cublas.*")))))
+                            .bind("cudaStructVarPtr"),
+                            Callback);
+  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
+                                hasType(namedDecl(matchesName("cuda.*|cublas.*"))))
+                                .bind("cudaParamDecl"),
+                                Callback);
+  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
+                                hasType(pointsTo(namedDecl(
+                                                 matchesName("cuda.*|cublas.*")))))
+                                .bind("cudaParamDeclPtr"),
+                                Callback);
+  Finder.addMatcher(expr(isExpansionInMainFile(),
+                         sizeOfExpr(hasArgumentOfType(recordType(hasDeclaration(
+                             cxxRecordDecl(matchesName("cuda.*|cublas.*")))))))
+                        .bind("cudaStructSizeOf"),
+                         Callback);
+  Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"),
+                                  Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(), allOf(
+                            hasAttr(attr::CUDAShared),
+                            hasType(incompleteArrayType())))
+                           .bind("cudaSharedIncompleteArrayVar"),
+                            Callback);
+}
+
+void printStats(std::string fileSource, HipifyPPCallbacks &PPCallbacks, Cuda2HipCallback &Callback) {
+  int64_t sum = 0;
+  for (int i = 0; i < CONV_LAST; i++) {
+    sum += Callback.countReps[i] + PPCallbacks.countReps[i];
+  }
+  llvm::outs() << "info: converted " << sum << " CUDA->HIP refs ( ";
+  for (int i = 0; i < CONV_LAST; i++) {
+    llvm::outs() << counterNames[i] << ':' << Callback.countReps[i] + PPCallbacks.countReps[i] << ' ';
+  }
+  llvm::outs() << "), by APIs ( ";
+  for (int i = 0; i < API_LAST; i++) {
+    llvm::outs() << apiNames[i] << ':' << Callback.countApiReps[i] + PPCallbacks.countApiReps[i] << ' ';
+  }
+  llvm::outs() << ") in \'" << fileSource << "\'\n";
+}
 
 int main(int argc, const char **argv) {
-
   llvm::sys::PrintStackTraceOnErrorSignal();
-
-  int Result;
-
   CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::Required);
-
   std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
-
   std::string dst = OutputFilename;
   if (dst.empty()) {
     dst = fileSources[0];
@@ -1651,7 +1807,6 @@ int main(int argc, const char **argv) {
     }
     dst += ".cu";
   }
-
   // copy source file since tooling makes changes "inplace"
   std::ifstream source(fileSources[0], std::ios::binary);
   std::ofstream dest(Inplace ? dst + ".prehip" : dst, std::ios::binary);
@@ -1664,84 +1819,19 @@ int main(int argc, const char **argv) {
   HipifyPPCallbacks PPCallbacks(&Tool.getReplacements());
   Cuda2HipCallback Callback(&Tool.getReplacements(), &Finder, &PPCallbacks);
 
-  Finder.addMatcher(callExpr(isExpansionInMainFile(),
-                             callee(functionDecl(matchesName("cuda.*|cublas.*"))))
-                             .bind("cudaCall"),
-                             &Callback);
-  Finder.addMatcher(cudaKernelCallExpr().bind("cudaLaunchKernel"), &Callback);
-  Finder.addMatcher(memberExpr(isExpansionInMainFile(),
-                               hasObjectExpression(hasType(cxxRecordDecl(
-                               matchesName("__cuda_builtin_")))))
-                               .bind("cudaBuiltin"),
-                               &Callback);
-  Finder.addMatcher(declRefExpr(isExpansionInMainFile(),
-                                to(enumConstantDecl(
-                                matchesName("cuda.*|cublas.*|CUDA.*|CUBLAS*"))))
-                                .bind("cudaEnumConstantRef"),
-                                &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(enumDecl()))
-                            .bind("cudaEnumConstantDecl"),
-                            &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(typedefDecl(matchesName("cuda.*|cublas.*"))))
-                            .bind("cudaTypedefVar"),
-                            &Callback);
-  // Array of elements of typedef type, Example: cudaStream_t streams[2];
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(arrayType(hasElementType(typedefType(
-                            hasDeclaration(typedefDecl(matchesName("cuda.*|cublas.*"))))))))
-                            .bind("cudaTypedefVar"),
-                            &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(cxxRecordDecl(matchesName("cuda.*|cublas.*"))))
-                            .bind("cudaStructVar"),
-                            &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(pointsTo(cxxRecordDecl(
-                                             matchesName("cuda.*|cublas.*")))))
-                            .bind("cudaStructVarPtr"),
-                            &Callback);
-  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
-                                hasType(namedDecl(matchesName("cuda.*|cublas.*"))))
-                                .bind("cudaParamDecl"),
-                                &Callback);
-  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
-                                hasType(pointsTo(namedDecl(
-                                                 matchesName("cuda.*|cublas.*")))))
-                                .bind("cudaParamDeclPtr"),
-                                &Callback);
-  Finder.addMatcher(expr(isExpansionInMainFile(),
-                         sizeOfExpr(hasArgumentOfType(recordType(hasDeclaration(
-                             cxxRecordDecl(matchesName("cuda.*|cublas.*")))))))
-                        .bind("cudaStructSizeOf"),
-                         &Callback);
-  Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"),
-                                  &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(), allOf(
-                            hasAttr(attr::CUDAShared),
-                            hasType(incompleteArrayType())))
-                           .bind("cudaSharedIncompleteArrayVar"),
-                            &Callback);
+  addAllMatchers(Finder, &Callback);
 
   auto action = newFrontendActionFactory(&Finder, &PPCallbacks);
-
-  std::vector<const char *> compilationStages;
+  std::vector<const char*> compilationStages;
   compilationStages.push_back("--cuda-host-only");
-
-  for (auto Stage : compilationStages) {
-    Tool.appendArgumentsAdjuster(
-        getInsertArgumentAdjuster(Stage, ArgumentInsertPosition::BEGIN));
-    Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11"));
+  Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster(compilationStages[0], ArgumentInsertPosition::BEGIN));
+  Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11"));
 #if defined(HIPIFY_CLANG_RES)
-    Tool.appendArgumentsAdjuster(
-        getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
+  Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
 #endif
-    Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster());
-    Result = Tool.run(action.get());
-
-    Tool.clearArgumentsAdjusters();
-  }
+  Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster());
+  int Result = Tool.run(action.get());
+  Tool.clearArgumentsAdjusters();
 
   LangOptions DefaultLangOptions;
   IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
@@ -1749,13 +1839,13 @@ int main(int argc, const char **argv) {
   DiagnosticsEngine Diagnostics(
       IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs()), &*DiagOpts,
       &DiagnosticPrinter, false);
-  SourceManager Sources(Diagnostics, Tool.getFiles());
 
   DEBUG(dbgs() << "Replacements collected by the tool:\n");
   for (const auto &r : Tool.getReplacements()) {
     DEBUG(dbgs() << r.toString() << "\n");
   }
 
+  SourceManager Sources(Diagnostics, Tool.getFiles());
   Rewriter Rewrite(Sources, DefaultLangOptions);
 
   if (!Tool.applyAllReplacements(Rewrite)) {
@@ -1771,16 +1861,8 @@ int main(int argc, const char **argv) {
     }
   }
   if (PrintStats) {
-    int64_t sum = 0;
-    for (int i = 0; i < ConvTypes::CONV_LAST; i++) {
-      sum += Callback.countReps[i] + PPCallbacks.countReps[i];
-    }
-    llvm::outs() << "info: converted " << sum << " CUDA->HIP refs ( ";
-    for (int i = 0; i < ConvTypes::CONV_LAST; i++) {
-      llvm::outs() << counterNames[i] << ':'
-                   << Callback.countReps[i] + PPCallbacks.countReps[i] << ' ';
-    }
-    llvm::outs() << ") in \'" << fileSources[0] << "\'\n";
+    printStats(fileSources[0], PPCallbacks, Callback);
   }
+
   return Result;
 }
diff --git a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
index 189f172c95..2d589ec415 100644
--- a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
+++ b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
@@ -42,12 +42,12 @@
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
 | `cudaStreamAddCallback`                                   |                               | Add a callback to a compute stream.                                                                                            |
-| `cudaStreamAttachMemAsync`                                |                               | Attach memory to a stream asynchronously.                                                                                      |
+| `cudaStreamAttachMemAsync`                                |                               | Attach managed memory to a stream asynchronously.                                                                                      |
 | `cudaStreamCreate`                                        | `hipStreamCreate`             | Create an asynchronous stream.                                                                                                 |
 | `cudaStreamCreateWithFlags`                               | `hipStreamCreateWithFlags`    | Create an asynchronous stream.                                                                                                 |
 | `cudaStreamCreateWithPriority`                            |                               | Create an asynchronous stream with the specified priority.                                                                     |
 | `cudaStreamDestroy`                                       | `hipStreamDestroy`            | Destroys and cleans up an asynchronous stream.                                                                                 |
-| `cudaStreamGetFlags`                                      |                               | Query the flags of a stream.                                                                                                   |
+| `cudaStreamGetFlags`                                      | `hipStreamGetFlags`           | Query the flags of a stream.                                                                                                   |
 | `cudaStreamGetPriority`                                   |                               | Query the priority of a stream.                                                                                                |
 | `cudaStreamQuery`                                         |                               | Queries an asynchronous stream for completion status.                                                                          |
 | `cudaStreamSynchronize`                                   | `hipStreamSynchronize`        | Waits for stream tasks to complete.                                                                                            |
@@ -100,8 +100,8 @@
 | `cudaHostAlloc`                                           | `hipHostMalloc`                | Allocates page-locked memory on the host.                                                                                      |
 | `cudaHostGetDevicePointer`                                | `hipHostGetDevicePointer`    | Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister.                 |
 | `cudaHostGetFlags`                                        | `hipHostGetFlags`           | Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc.                                              |
-| `cudaHostRegister`                                        |                               | Registers an existing host memory range for use by CUDA.                                                                       |
-| `cudaHostUnregister`                                      |                               | Unregisters a memory range that was registered with cudaHostRegister.                                                          |
+| `cudaHostRegister`                                        | `hipHostRegister`             | Registers an existing host memory range for use by CUDA.                                                                       |
+| `cudaHostUnregister`                                      | `hipHostUnregister`           | Unregisters a memory range that was registered with cudaHostRegister.                                                          |
 | `cudaMalloc`                                              | `hipMalloc`                   | Allocate memory on the device.                                                                                                 |
 | `cudaMalloc3D`                                            |                               | Allocates logical 1D, 2D, or 3D memory objects on the device.                                                                  |
 | `cudaMalloc3DArray`                                       |                               | Allocate an array on the device.                                                                                               |
@@ -231,7 +231,7 @@
 
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-| `cudaBindSurfaceToArra`y                                  |                               | Binds an array to a surface.                                                                                                   |
+| `cudaBindSurfaceToArray`                                  |                               | Binds an array to a surface.                                                                                                   |
 | `cudaBindTexture`                                         |                               | Binds a memory area to a texture.                                                                                              |
 | `cudaBindTexture2D`                                       |                               | Binds a 2D memory area to a texture.                                                                                           |
 | `cudaBindTextureToArray`                                  |                               | Binds an array to a texture.                                                                                                   |
diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md
index 8b50a9a8c7..d62f40510a 100644
--- a/docs/markdown/hip_faq.md
+++ b/docs/markdown/hip_faq.md
@@ -32,36 +32,52 @@
 HIP provides the following:
 - Devices (hipSetDevice(), hipGetDeviceProperties(), etc.)
 - Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.)
-- Streams (hipStreamCreate(), etc.)
+- Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(),  etc.)
 - Events (hipEventRecord(), hipEventElapsedTime(), etc.)
 - Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>)
+- HIP Module API to control when adn how code is loaded.
 - CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim)
+- Cross-lane instructions including shfl, ballot, any, all
 - Most device-side math built-ins
 - Error reporting (hipGetLastError(), hipGetErrorString())
 
 The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
 
 ### What is not supported?
-#### Run-time features
+#### Runtime/Driver API features
+At a high-level, the following features are not supported:
 - Textures 
-- MemcpyToSymbol functions
 - Dynamic parallelism (CUDA 5.0)
 - Managed memory (CUDA 6.5)
 - Graphics interoperation with OpenGL or Direct3D
+- CUDA Driver API (Under Development)
+- CUDA IPC Functions (Under Development)
+
 - CUDA array, mipmappedArray and pitched memory
-- CUDA Driver API
+- MemcpyToSymbol functions
+- Queue priority controls
+
+See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
 
 #### Kernel language features
 - Device-side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0)
 - Virtual functions, indirect functions and try/catch (CUDA 4.0)
 - `__prof_trigger` 
-- PTX assembly (CUDA 4.0)
-- Several kernel features are under development.  See the [HIP Kernel Language](hip_kernel_language.md) for more information.
+- PTX assembly (CUDA 4.0).  HCC supports inline GCN assembly.
+- Several kernel features are under development.  See the [HIP Kernel Language](hip_kernel_language.md) for more information.  These include:
+  - printf
+  - assert
+  - `__restrict__`
+  - `__launch_bounds__`
+  - `__threadfence*_`, `__syncthreads*`
+  - Unbounded loop unroll
+
+
 
 ### Is HIP a drop-in replacement for CUDA?
 No. HIP provides porting tools which do most of the work do convert CUDA code into portable C++ code that uses the HIP APIs.
 Most developers will port their code from CUDA to HIP and then maintain the HIP version. 
-HIP code provides the same performance as coding in native CUDA, plus the benefit that the code can also run on AMD platforms.
+HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
 
 ### What specific version of CUDA does HIP support?
 HIP APIs and features do not map to a specific CUDA version.  HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can 
diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md
index 88f0237706..ba52b88433 100644
--- a/docs/markdown/hip_kernel_language.md
+++ b/docs/markdown/hip_kernel_language.md
@@ -156,7 +156,7 @@ The `__constant__` keyword is supported. The host writes constant memory before
 ### `__shared__` 
 The `__shared__` keyword is supported.
 
-`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. This feature is under development.
+`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter.  HIP uses an alternate syntax based on the HIP_DYNAMIC_SHARED macro.
 
 ### `__managed__`
 Managed memory, including the `__managed__` keyword, are not supported in HIP.
@@ -537,7 +537,6 @@ HIP supports the following atomic operations.
 ### Caveats and Features Under-Development:
 
 - HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently.
-- wrapping increment and decrement are under development.
 
 ## Warp Cross-Lane Functions
 
@@ -573,8 +572,6 @@ Applications can test whether the target platform supports the any/all instructi
 
 ### Warp Shuffle Functions 
 
-The following warp shuffle instructions are under development.
-
 Half-float shuffles are not supported. The default width is warpSize---see [Warp Cross-Lane Functions](#warp-cross-lane-functions). Applications should not assume the warpSize is 32 or 64.
 
 ```
@@ -670,3 +667,22 @@ The following C++ features are not supported:
 - Run-time-type information (RTTI)
 - Virtual functions
 - Try/catch
+
+## Kernel Compilation
+HIP now supports compiling C++/HIP kernels to binary. Eventhough HIP does not support fatbinary (yet), the user can specify the target for which the binary can be generated. The file format for binary is `.co` which means Code Object. The following command builds the binary using `hipcc`.
+
+`hipcc --genisa --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE]`
+```[TARGET GPU] = fiji/hawaii
+[INPUT FILE] = Name of the file containing kernels
+[OUTPUT FILE] = Name of the generated code object file```
+
+Note that the kernel file should have `int main(){}` at the end it so that the binary is generated. This happens because HCC generates binaries at linking time rather than compilation
+
+You need 3 things to run kernel in binary.
+1. Kernel Binary
+2. Name of kernel binary
+3. Name of the kernel
+
+We already got first two of them. In order to get name of the kernel, try `objdump -x [OUTPUT FILE]`. OUTPUT FILE is file generated by hipcc during kernel compilation. The output from objdump has symbol to the kernel whose name is mangled with `grid_launch_parm`, `__functor`, `__cxxamp_trampoline`. An example of how it looks is `ZN12_GLOBAL__N_137_Z3Cpy16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf` where `Cpy` is the name of the kernel written in C++.
+
+
diff --git a/docs/markdown/hip_performance.md b/docs/markdown/hip_performance.md
new file mode 100644
index 0000000000..98197b3db7
--- /dev/null
+++ b/docs/markdown/hip_performance.md
@@ -0,0 +1,42 @@
+# HIP Performance Optimizations
+
+Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
+
+### Unpinned Memory Transfer Optimizations
+ 
+#### On Small BAR Setup
+
+There are two possible ways to transfer data from Host to Device (H2D) and Device to Host(D2H)
+ * Using Staging Buffers
+ * Using PinInPlace
+
+#### On Large BAR Setup
+
+There are two possible ways to transfer data from Host to Device (H2D)
+ * Using Staging Buffers
+ * Using PinInPlace
+ * Direct Memcpy
+ 
+ And there are two possible ways to transfer data from Device to Host (D2H)
+ * Using Staging Buffers
+ * Using PinInPlace
+ 
+Some GPUs may not be able to directly access host memory, and in these cases we need to
+stage the copy through an optimized pinned staging buffer, to implement H2D and D2H copies.The copy is broken into buffer-sized chunks to limit the size of the buffer and also to provide better performance by overlapping the CPU copies with the DMA copies.
+
+PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA
+engine.
+
+By default staging buffers are used for unpinned memory transfers, however other ways can be used by enabling few environment variables (so no need to build the code again!!!)
+
+Following environment variables can be used:
+
+- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
+
+- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
+  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
+  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
+  -   HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
+
+
+
diff --git a/docs/markdown/hip_porting_driver_api.md b/docs/markdown/hip_porting_driver_api.md
new file mode 100644
index 0000000000..b0ac3ecf1d
--- /dev/null
+++ b/docs/markdown/hip_porting_driver_api.md
@@ -0,0 +1,257 @@
+# Porting CUDA Driver API
+
+## Introduction to the CUDA Driver and Runtime APIs
+CUDA provides a separate CUDA Driver and Runtime APIs.  The two APis have significant overlap in functionality:  
+- Both APIs support events, streams, memory management, memory copy, and error handling.
+- Both APIs deliver similar performance.  
+- Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`.  For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
+- The Driver API defines a different but largely overlapping error code code space than the Runtime API, and uses a different coding convention.  For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
+
+
+The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
+
+### cuModule API
+The Module section of the Driver API provides additional control over how and when accelerator
+ code objects are loaded.
+For example, the driver API allows code objects to be loaded from files or memory pointers.
+Symbols for kernels or global data can be extracted from the loaded code objects.
+In contrast, the Runtime API automatically loads and (if necessary) compiles all of the 
+kernels from a executable binary when run.
+In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
+
+Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`.
+The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
+The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles
+a special function call and is easier to use than explicit launch API (in particular with respect to
+handling of kernel arguments).
+However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.  
+
+The Module features are useful in an environment which generate the code objects directly, such as a new
+accelerator language front-end.  Here, NVCC is not used.   Instead, the environment may have a 
+different kernel language or different compilation flow.
+Other environments have many kernels and do not want them to be all loaded automatically.
+The Module functions can be used to load the generated code objects and launch kernels.
+As we will see below, HIP defines a Module API which provides similar explict control over code
+object managemenet.
+
+### cuCtx API
+The Driver API defines "Context" and "Devices" as separate entities.  
+Contexts contain a single device, and a device can theoretically have multiple contexts.  
+Each context contains a set of streams and events specific to the context.  
+Historically contexts also defined a unique address space for the GPU, though this may not longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).  
+The Context APIs also provide a mechanism to switch between devices, which allowed
+a single CPU thread to send commands to different GPUs.  HIP as well as a recent versions
+of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or
+`cudaSetDevice`.
+
+The CUDA Runtime API unifies the Context API with the Device API.  This simplifies the APIs
+and has little loss of functionality since each Context can contain a single device, 
+and the benefits of multiple contexts has been replaced with other interfaces.   HIP provides
+a context API to facilitate easy porting from existing Driver codes.  
+In HIP, the Ctx functions largely provide an alternate syntax for changing the active device. 
+Most new applications will prefer to use `hipSetDevice` or the stream APIs.
+
+## HIP Module and Ctx APIs
+
+Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules 
+and Ctx control.   
+
+### hipModule API
+
+Like the CUDA Driver API, the Module API provides additional control 
+over how code is loaded, including options to load code from files or from in-memory pointers.  
+NVCC and HCC target different architectures and use different code object formats : NVCC
+is `cubin` or `ptx` files, while the HCC path is the `hsaco` format.   The external
+compilers which generate these code objects are responsible for generating and loading
+the correct code object for each platform.  Notably, there is not a fat binary format that
+can contain code for both NVCC and HCC platforms.     The following table summarizes the
+formats used on each platform:
+
+| Format        | APIs                         | NVCC               | HCC               |
+| ---           | ---                          | ---                | ---               |
+| Code Object   | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco            |
+| Fat Binary    | hipModuleLoadFatBin          | .fatbin            | Under Development | 
+
+hipcc uses NVCC and HCC to compile host codes.  Both of these may embed code objects
+into the final executable, and these code objects will be automatically loaded when
+the application starts.
+The hipModule API can be used to load additional code objects, and in this way
+provides an extended capability to the automatically loaded code objects.  HCC allows
+both of these capabilities to be used together, if desired.  Of course it is possible
+to create a program with no kernels and thus no automatic loading.
+
+
+### hipCtx API
+HIP provides a `Ctx` API as a thin layer over the existing Device functions.  This Ctx API
+can be used to set the current context, or to query properties of the device associated with
+the context.  The current context is implicitly used by other APIs such as `hipStreamCreate`.
+
+### hipify translation of CUDA Driver API
+The hipify tool will convert CUDA Driver APIs for streams, events, memory management to 
+the equivalent HIP driver calls.   For example, `cuEventCreate` will be translated to 
+`hipEventCreate`.  Hipify also converts error code from the Driver namespace and coding
+convention to the equivalent HIP error code.  Thus, HIP unifies the APis for these common functions.
+[hipify support for translating driver API is Under Development]
+
+The memory copy APIs require additional explanation.  The CUDA driver includes the memory
+direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides
+a single memory copy API with a parameter that specifies the direction and additionally
+supports a "default" direction where the runtime determines the direction automatically.
+HIP provides APis with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`.
+The first flavor may be faster in some cases since they avoid host overhead to detect the 
+different memory directions.
+
+HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).  
+
+### HCC Implementation Notes
+#### .hsaco
+The .hsaco format used by HCC is described in more detail [here](https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc).
+An example and blog that show how to use the format is [here](http://gpuopen.com/rocm-with-harmony-combining-opencl-hcc-hsa-in-a-single-program).  hsaco can be generated by hcc + extractkernel tool,
+cloc, the GCN assembler, or other tools.
+
+#### Address Spaces
+HCC defines a process-wide address space where the CPU and all devices allocate
+addresses from a single unified pool.  Thus addresses may be shared between contexts, and 
+unlike the original CUDA definition a new context does not create a new address space for
+the device.   
+
+#### Using hipModuleLaunchKernel
+`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. The argument `kernelParams` is not fully implemented for HCC. The workaround for it is, to use platform specific macros for each target. Or, `extra` argument can be used which works on both the platforms.
+
+#### Additional Information
+- HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
+- HCC creates a primary context when the HIP API is called.  So in a pure driver API code, HIP/HCC will create a primary context while HIP/NVCC will have empty context stack.  HIP/HCC will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
+
+### NVCC Implementation Notes
+
+#### Interoperation between HIP and CUDA Driver
+CUDA applications may want to mix CUDA driver code with HIP code (see example below).  This
+table shows the type equivalence to enable this interaction.
+
+|**HIP Type** |**CU Driver Type**|  **CUDA Runtime Type** |
+| ----        | ----             | ---- | 
+| hipModule   |  CUmodule        |   			|
+| hipFunction |  CUfunction      | 			|
+| hipCtx_t    |  CUcontext       | 			|
+| hipDevice_t |  CUdevice        |  			|
+| hipStream_t |  CUstream        | cudaStream_t         |
+| hipEvent_t  |  CUevent         | cudaEvent_t          |
+| hipArray    |  CUarray         | cudaArray            |
+
+
+
+#### Compilation Flags
+The hipModule interface does not support the hipModuleLoadEx function, which is used to control PTX compilaton options.
+HCC does not use PTX and does not support the same compilation options.
+In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step.
+Code which requires this functionaly should use platform-specific coding, calling `cuModuleLoadEx` 
+on the NVCC path and hipModuleLoad on the hcc path.  For example:
+
+```
+hipModule module;
+void *imagePtr = ... ;  // Somehow populate data pointer with code object
+
+#ifdef __HIP_PLATFORM_NVCC__
+// Use CUDA driver API but write to hipModule since they are same type:
+const int numOptions = 1;
+CUJit_option options[numOptions];
+void * optionValues[numOptions];
+
+options[0] = CU_JIT_MAX_REGISTERS;
+unsigned maxRegs=15;
+optionValues[0] = (void*) (&maxRegs);
+
+cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
+
+#else // __HIP_PLATFORM_HCC__
+
+// HCC path does not support or require JIT options, so just load the module.
+hipModuleLoadData(&module, imagePtr);
+
+#endif
+
+// Back to unified code - both paths above loaded the "module" variable.
+hipFunction k;
+hipModuleGetFunction(&k, module, "myKernel");
+```
+
+The below sample shows how to use `hipModuleGetFunction`.
+
+```
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__
+#define fileName "vcpy_isa.co"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#endif
+
+#define kernel_name "hello_world"
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+        std::cout<<A[i] << " "<<B[i]<<std::endl;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+          hipInit(0);
+          hipDevice_t device;
+          hipCtx_t context;
+          hipDeviceGet(&device, 0);
+          hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&con                                     fig);
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=0;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+          hipCtxDetach(context);
+#endif
+
+    return 0;
+}
+
+```
diff --git a/docs/markdown/hip_porting_guide.md b/docs/markdown/hip_porting_guide.md
index 4d376f4a5f..45e5455ea4 100644
--- a/docs/markdown/hip_porting_guide.md
+++ b/docs/markdown/hip_porting_guide.md
@@ -395,13 +395,53 @@ For new projects or ports which can be re-factored, we recommend the use of the
 This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
 run hipcc when appropriate.
 
-### Workarounds
+## Workarounds
 
-#### warpSize
+### warpSize
 Code should not assume a warp size of 32 or 64.  See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.
 
+## memcpyToSymbol
 
-#### Textures and Cache Control
+HIP support for hipMemCpyToSymbol is under-development.  This feature allows a kernel
+to define a device-side data symbol which can be accessed on the host side.  The symbol
+can be in __constant or device space.  As a workaround, programs can pass the symbol
+as an argument to the kernel, and use standard hipMemcpy routines to initialize it.
+
+For example:
+
+Device Code:
+```
+// Cuda Device Code
+__constant__ float Array[1024];
+__global__ void Inc(float *Out){
+   Int tx = hipThreadIdx_x;
+   Out[tx] = Array[tx] + 1;
+}
+ 
+// HIP Device Code
+__global__ void Inc(hipLaunchParm lp, float *Array, float *Out){
+   Int tx = hipThreadIdx_x;
+   Out[tx] = Array[tx] + 1;
+}
+```
+ 
+Host Code:
+```
+// CUDA Host Code
+cudaMemcpyToSymbol(Array, hostArray, sizeofArray);
+ 
+// HIP Host Code
+hipMemcpy(Array, hostArray, sizeofArray);
+```
+ 
+## threadfence_system
+Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
+Some implementations can provide this behavior by flushing the GPU L2 cache.
+HIP/HCC does not provide this functionality.  As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to 
+disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have 
+a performance impact.
+
+### Textures and Cache Control
 
 >Texture support is under-development and not yet supported by HIP.
 
diff --git a/include/hcc_detail/hipComplex.h b/include/hcc_detail/hipComplex.h
new file mode 100644
index 0000000000..910cee946d
--- /dev/null
+++ b/include/hcc_detail/hipComplex.h
@@ -0,0 +1,172 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#ifndef HIPCOMPLEX_H
+#define HIPCOMPLEX_H
+
+typedef struct{
+    float x;
+    float y;
+}hipFloatComplex;
+
+__device__ static inline float hipCrealf(hipFloatComplex z){
+    return z.x;
+}
+
+__device__ static inline float hipCimagf(hipFloatComplex z){
+    return z.y;
+}
+
+__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+    hipFloatComplex z;
+    z.x = a;
+    z.y = b;
+    return z;
+}
+
+__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
+    hipFloatComplex ret;
+    ret.x = z.x;
+    ret.y = -z.y;
+    return ret;
+}
+
+__device__ static inline float hipCsqabsf(hipFloatComplex z){
+    return z.x * z.x + z.y * z.y;
+}
+
+__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+    float sqabs = hipCsqabsf(q);
+    hipFloatComplex ret;
+    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
+    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    return ret;
+}
+
+__device__ static inline float hipCabsf(hipFloatComplex z){
+    return sqrtf(hipCsqabsf(z));
+}
+
+
+typedef struct{
+    double x;
+    double y;
+}hipDoubleComplex;
+
+__device__ static inline double hipCreal(hipDoubleComplex z){
+    return z.x;
+}
+
+__device__ static inline double hipCimag(hipDoubleComplex z){
+    return z.y;
+}
+
+__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+    hipDoubleComplex z;
+    z.x = a;
+    z.y = b;
+    return z;
+}
+
+__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
+    hipDoubleComplex ret;
+    ret.x = z.x;
+    ret.y = z.y;
+    return ret;
+}
+
+__device__ static inline double hipCsqabs(hipDoubleComplex z){
+    return z.x * z.x + z.y * z.y;
+}
+
+__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+    double sqabs = hipCsqabs(q);
+    hipDoubleComplex ret;
+    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
+    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    return ret;
+}
+
+__device__ static inline double hipCabs(hipDoubleComplex z){
+    return sqrtf(hipCsqabs(z));
+}
+
+typedef hipFloatComplex hipComplex;
+
+__device__ static inline hipComplex make_hipComplex(float x,
+                                            float y){
+    return make_hipFloatComplex(x, y);
+}
+
+__device__ static inline hipFloatComplex hipComplexDoubleToFloat
+(hipDoubleComplex z){
+    return make_hipFloatComplex((float)z.x, (float)z.y);
+}
+
+__device__ static inline hipDoubleComplex hipComplexFloatToDouble
+(hipFloatComplex z){
+    return make_hipDoubleComplex((double)z.x, (double)z.y);
+}
+
+__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+    float real = (p.x * q.x) + r.x;
+    float imag = (q.x * p.y) + r.y;
+
+    real = -(p.y * q.y) + real;
+    imag = (p.x * q.y) + imag;
+
+    return make_hipComplex(real, imag);
+}
+
+__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
+    float real = (p.x * q.x) + r.x;
+    float imag = (q.x * p.y) + r.y;
+
+    real = -(p.y * q.y) + real;
+    imag = (p.x * q.y) + imag;
+
+    return make_hipDoubleComplex(real, imag);
+}
+
+#endif
diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index 2f0c33a0d2..01c36afde4 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -160,12 +160,17 @@ class ihipCtx_t;
 #endif
 
 
+// Just initialize the HIP runtime, but don't log any trace information.
+#define HIP_INIT()\
+	std::call_once(hip_initialized, ihipInit);\
+    ihipCtxStackUpdate();
+
 
 // This macro should be called at the beginning of every HIP API.
 // It initialies the hip runtime (exactly once), and
 // generate trace string that can be output to stderr or to ATP file.
 #define HIP_INIT_API(...) \
-	std::call_once(hip_initialized, ihipInit);\
+    HIP_INIT()\
     API_TRACE(__VA_ARGS__);
 
 #define ihipLogStatus(hipStatus) \
@@ -396,6 +401,22 @@ public:
 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
 
+class ihipModule_t{
+public:
+  hsa_executable_t executable;
+  hsa_code_object_t object;
+  std::string fileName;
+  void *ptr;
+  size_t size;
+};
+
+
+class ihipFunction_t{
+public:
+  hsa_executable_symbol_t kernel_symbol;
+  uint64_t kernel;
+};
+
 // Internal stream structure.
 class ihipStream_t {
 public:
@@ -404,8 +425,9 @@ typedef uint64_t SeqNum_t ;
     ~ihipStream_t();
 
     // kind is hipMemcpyKind
-    void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind);
-    void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind);
+    void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
+    void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
+
 
     void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
 
@@ -423,7 +445,7 @@ typedef uint64_t SeqNum_t ;
     // Use this if we already have the stream critical data mutex:
     void                 wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
 
-
+    void launchModuleKernel(hsa_signal_t signal, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel);
 
     // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function.
     SIGSEQNUM            lastCopySeqId (LockedAccessor_StreamCrit_t &crit) const { return crit->_last_copy_signal ? crit->_last_copy_signal->_sigId : 0; };
@@ -443,6 +465,7 @@ public:
     hc::accelerator_view        _av;
     unsigned                    _flags;
 
+
 private:
     void     enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal);
     void     waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal);
@@ -648,6 +671,8 @@ extern void ihipInit();
 extern const char *ihipErrorString(hipError_t);
 extern ihipCtx_t    *ihipGetTlsDefaultCtx();
 extern void          ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
+extern hipError_t    ihipSynchronize(void);
+extern hipError_t    ihipCtxStackUpdate();
 
 extern ihipDevice_t *ihipGetDevice(int);
 ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h
index d009bec35b..727604d8d8 100644
--- a/include/hcc_detail/hip_runtime.h
+++ b/include/hcc_detail/hip_runtime.h
@@ -568,16 +568,66 @@ __device__ void  __threadfence_system(void);
 #define hipGridDim_y   (hc_get_num_groups(1))
 #define hipGridDim_z   (hc_get_num_groups(2))
 
+// loop unrolling
+__device__ static inline void* memcpy(void* dst, void* src, size_t size)
+{
+    uint64_t i = 0;
+    uint64_t totalLength = size/sizeof(uint32_t);
+    for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
+                  i<(totalLength/4);
+                  i = i + hipBlockDim_x * hipGridDim_x)
+    {
+        ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
+        ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
+        ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
+        ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
+    }
+    if(4*i < totalLength){
+        ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
+        ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
+        ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
+        ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
+
+    }
+    return nullptr;
+}
+
+__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
+{
+    uint32_t _val = 0;
+    _val = (val | val << 8 | val << 16 | val << 24);
+    uint64_t totalLength = size/sizeof(uint32_t);
+    uint64_t i = 0;
+    for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
+                  i<(totalLength/4);
+                  i = i + hipBlockDim_x * hipGridDim_x)
+    {
+        ((uint32_t*)ptr)[4*i] = _val;
+        ((uint32_t*)ptr)[4*i+1] = _val;
+        ((uint32_t*)ptr)[4*i+2] = _val;
+        ((uint32_t*)ptr)[4*i+3] = _val;
+    }
+    if(4*i < totalLength){
+        ((uint32_t*)ptr)[4*i] = _val;
+        ((uint32_t*)ptr)[4*i+1] = _val;
+        ((uint32_t*)ptr)[4*i+2] = _val;
+        ((uint32_t*)ptr)[4*i+3] = _val;
+
+    }
+    return nullptr;
+}
+
 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
 
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
 
 #ifdef __HCC_CPP__
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
-void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
+extern void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream);
+extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
 
 // TODO - move to common header file.
 #define KNRM  "\x1B[0m"
@@ -589,10 +639,9 @@ do {\
   grid_launch_parm lp;\
   lp.dynamic_group_mem_bytes = _groupMemBytes; \
   hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
-    if (HIP_TRACE_API) {\
-        fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
-                #_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.dynamic_group_mem_bytes, (void*)(_stream));\
-    }\
+  if (HIP_TRACE_API) {\
+      ihipPrintKernelLaunch(#_kernelName, &lp, _stream); \
+  }\
   _kernelName (lp, ##__VA_ARGS__);\
   ihipPostLaunchKernel(trueStream, lp);\
 } while(0)
@@ -653,4 +702,6 @@ do {\
  */
 
 
+
+
 #endif
diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 0316f73b57..fbcca3d12f 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -52,13 +52,13 @@ typedef struct ihipDevice_t *hipDevice_t;
 
 typedef struct ihipStream_t *hipStream_t;
 
-typedef uint64_t hipFunction;
+typedef struct ihipModule_t *hipModule_t;
 
-typedef uint64_t hipModule;
+typedef struct ihipFunction_t *hipFunction_t;
 
-typedef struct hipEvent_t {
-    struct ihipEvent_t *_handle;
-} hipEvent_t;
+typedef void* hipDeviceptr_t;
+
+typedef struct ihipEvent_t *hipEvent_t;
 
 
 /**
@@ -844,6 +844,14 @@ hipError_t hipHostFree(void* ptr);
  */
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
 
+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
+
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
+
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
+
+hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes);
+
 
 /**
  *  @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area pointed to by @p offset bytes from the start of symbol @p symbol.
@@ -1054,32 +1062,173 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
 hipError_t hipInit(unsigned int flags) ;
 
 
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Context Management
+ *  @{
+ */
 
-// TODO-ctx
+/**
+ * @brief Create a context and set it as current/ default context
+ *
+ * @param [out] ctx
+ * @param [in] flags
+ * @param [in] associated device handle
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
 hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device);
 
 hipError_t hipCtxDestroy(hipCtx_t ctx);
 
+/**
+ * @brief Pop the current/default context and return the popped context.
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
 
+/**
+ * @brief Push the context to be set as current/ default context
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
+
 hipError_t hipCtxPushCurrent(hipCtx_t ctx);
 
+/**
+ * @brief Set the passed context as current/default
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxSetCurrent(hipCtx_t ctx);
 
+/**
+ * @brief Get the handle of the current/ default context
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
 
+/**
+ * @brief Get the handle of the device associated with current/default context
+ *
+ * @param [out] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
+
 hipError_t hipCtxGetDevice(hipDevice_t *device);
 
+/**
+ * @brief Returns the approximate HIP api version.
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
+ * This function always set *apiVersion to 4 as an approximation though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP apps code should not rely on the api revision number here and should
+ * use arch feature flags to test device capabilities or conditional compilation.
+ *
+ */
 hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion);
 
+/**
+ * @brief Set Cache configuration for a specific function
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig );
 
+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig );
 
+/**
+ * @brief Set Shared memory bank configuration.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config );
 
+/**
+ * @brief Get Shared memory bank configuration.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig );
 
+/**
+ * @brief Blocks until the default context has completed all preceding requested tasks.
+ *
+ * This function waits for all streams on the default context to complete execution, and then returns.
+ *
+ * @returns #hipSuccess.
+*/
+hipError_t hipCtxSynchronize ( void );
+
+/**
+ * @brief Get flags used for creating current/default context.
+ *
+ * @param [out] flags
+ *
+ * @returns #hipSuccess.
+*/
+
+hipError_t hipCtxGetFlags ( unsigned int* flags );
+
+/**
+ * @brief Enables direct access to memory allocations in a peer context.
+ *
+ * Memory which already allocated on peer device will be mapped into the address space of the current device.  In addition, all
+ * future memory allocations on peerDeviceId will be mapped into the address space of the current device when the memory is allocated.
+ * The peer memory remains accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerCtx
+ * @param [in] flags
+ *
+ * Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
+ * @warning PeerToPeer support is experimental.
+ */
+hipError_t  hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags);
+
+/**
+ * @brief Disable direct access from current context's virtual address space to memory allocations physically located on a peer context.Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
+ *
+ * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
+ *
+ * @param [in] peerCtx
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ * @warning PeerToPeer support is experimental.
+ */
+hipError_t  hipCtxDisablePeerAccess (hipCtx_t peerCtx);
+// doxygen end Context Management
+/**
+ * @}
+ */
+
+
 // TODO-ctx
 /**
  * @return hipSuccess, hipErrorInvalidDevice
@@ -1099,12 +1248,84 @@ hipError_t hipDeviceGetFromId(hipDevice_t *device, int deviceId);
  */
 hipError_t hipDriverGetVersion(int *driverVersion) ;
 
+/**
+ * @brief Loads code object from file into a hipModule_t
+ *
+ * @param [in] fname
+ * @param [out] module
+ * 
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound, hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized
+ *
+ * 
+ */
+hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
 
-hipError_t hipModuleLoad(hipModule *module, const char *fname);
+/**
+ * @brief Freeing the module
+ *
+ * @param [in] module
+ *
+ * @returns hipSuccess, hipInvalidValue
+ * module is freed and the code objects associated with it are destroyed
+ * 
+ */
 
-hipError_t hipModuleGetFunction(hipFunction *function, hipModule module, const char *kname);
+hipError_t hipModuleUnload(hipModule_t module);
 
-hipError_t hipLaunchModuleKernel(hipFunction f,
+/**
+ * @brief Function with kname will be extracted present in module
+ *
+ * @param [in] module
+ * @param [in] kname
+ * @param [out] function
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized, hipErrorNotFound, 
+ */
+hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
+
+/**
+ * @brief returns device memory pointer and size of the kernel present in the module with symbol - name
+ *
+ * @param [in] moodule
+ * @param [in] name
+ * @param [out] dptr
+ * @param [out[ bytes
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
+ */
+hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
+
+/**
+ * @brief builds module from code object which resides in host memory. And image is pointer to that location.
+ *
+ * @param [in] image
+ * @param [out] module
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized
+ */
+hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
+
+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
+ *
+ * @param [in[ f
+ * @param [in] gridDimX
+ * @param [in] gridDimY
+ * @param [in] gridDimZ
+ * @param [in] blockDimX
+ * @param [in] blockDimY
+ * @param [in] blockDimZ
+ * @param [in] sharedMemBytes
+ * @param [in] stream
+ * @param [in] kernelParams
+ * @param [in] extraa
+ *
+ * The function takes the above arguments and run the kernel in hipFunction_t f.  with launch parameters specified in gridDimX, gridDimY, gridDimZ,  blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
+ */
+hipError_t hipModuleLaunchKernel(hipFunction_t f,
                               unsigned int gridDimX,
                               unsigned int gridDimY,
                               unsigned int gridDimZ,
@@ -1114,7 +1335,7 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
                               unsigned int sharedMemBytes,
                               hipStream_t stream,
                               void **kernelParams,
-                              void **extra) __attribute__((deprecated("kernelParams is not fully supported, use extra instead"))) ;
+                              void **extra) ;
 
 // doxygen end Version Management
 /**
diff --git a/include/hipComplex.h b/include/hipComplex.h
new file mode 100644
index 0000000000..27281a2df4
--- /dev/null
+++ b/include/hipComplex.h
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/hipComplex.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/hipComplex.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
diff --git a/include/nvcc_detail/hipComplex.h b/include/nvcc_detail/hipComplex.h
new file mode 100644
index 0000000000..b5c182bd4d
--- /dev/null
+++ b/include/nvcc_detail/hipComplex.h
@@ -0,0 +1,108 @@
+#ifndef HIPCOMPLEX_H
+#define HIPCOMPLEX_H
+
+#include"cuComplex.h"
+
+typedef cuFloatComplex hipFloatComplex;
+
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
+    return cuCrealf(z);
+}
+
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
+    return cuCimagf(z);
+}
+
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+    return make_cuFloatComplex(a, b);
+}
+
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
+    return cuConjf(z);
+}
+
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
+    return cuCabsf(z) * cuCabsf(z);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+    return cuCaddf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+    return cuCsubf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+    return cuCmulf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+    return cuCdivf(p, q);
+}
+
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
+    return cuCabsf(p, q);
+}
+
+typedef cuDoubleComplex hipDoubleComplex;
+
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
+    return cuCreal(z);
+}
+
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
+    return cuCimag(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+    return make_cuDoubleComplex(a, b);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
+    return cuConj(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){
+    return cuCabs(z) * cuCabs(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCadd(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCsub(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCdiv(p, q);
+}
+
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
+    return cuCabs(z);
+}
+
+typedef cuFloatComplex hipComplex;
+
+__device__ __host__ static inline hipComplex make_Complex(float x, float y){
+    return make_cuComplex(x, y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z){
+    return cuComplexDoubleToFloat(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z){
+    return cuComplexFloatToDouble(z);
+}
+
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+    return cuCfmaf(p, q, r);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){
+    return cuCfma(p, q, r);
+}
+
+#endif
diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index a51bca2d02..baeb080195 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -22,7 +22,7 @@ THE SOFTWARE.
 #pragma once
 
 #include <cuda_runtime_api.h>
-
+#include <cuda.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -58,8 +58,20 @@ hipMemcpyHostToHost
 #define hipHostRegisterPortable cudaHostRegisterPortable
 #define hipHostRegisterMapped cudaHostRegisterMapped
 
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE     CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_END            CU_LAUNCH_PARAM_END
+
 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;
+typedef CUcontext hipCtx_t;
+typedef CUsharedconfig hipSharedMemConfig;
+typedef CUfunc_cache hipFuncCache;
+typedef CUdevice hipDevice_t;
+typedef CUmodule hipModule_t;
+typedef CUfunction hipFunction_t;
+typedef CUdeviceptr hipDeviceptr_t;
+
 //typedef cudaChannelFormatDesc hipChannelFormatDesc;
 #define hipChannelFormatDesc cudaChannelFormatDesc
 
@@ -85,6 +97,20 @@ switch(cuError) {
 };
 }
 
+inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further
+switch(cuError) {
+    case CUDA_SUCCESS                            : return hipSuccess;
+    case CUDA_ERROR_OUT_OF_MEMORY                : return hipErrorMemoryAllocation            ;
+    case CUDA_ERROR_INVALID_VALUE                : return hipErrorInvalidValue                ;
+    case CUDA_ERROR_INVALID_DEVICE               : return hipErrorInvalidDevice               ;
+    case CUDA_ERROR_DEINITIALIZED                : return hipErrorDeinitialized               ;
+    case CUDA_ERROR_NO_DEVICE                    : return hipErrorNoDevice                    ;
+    case CUDA_ERROR_INVALID_CONTEXT              : return hipErrorInvalidContext              ;
+    case CUDA_ERROR_NOT_INITIALIZED              : return hipErrorNotInitialized              ;
+    default                                      : return hipErrorUnknown;  // Note - translated error.
+};
+}
+
 // TODO   match the error enum names of hip and cuda
 inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
 switch(hError) {
@@ -124,6 +150,11 @@ default:
 }
 }
 
+inline static hipError_t hipInit(unsigned int flags)
+{
+    return hipCUResultTohipError(cuInit(flags));
+}
+
 inline static hipError_t hipDeviceReset() {
     return hipCUDAErrorTohipError(cudaDeviceReset());
 }
@@ -182,6 +213,25 @@ inline static hipError_t hipHostFree(void* ptr)  {
 inline static hipError_t hipSetDevice(int device) {
     return hipCUDAErrorTohipError(cudaSetDevice(device));
 }
+
+inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, 
+                  void* src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoH(void* dst, 
+                  hipDeviceptr_t src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst,
+            hipDeviceptr_t src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
+}
+
 inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind) {
   return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
 }
@@ -347,20 +397,6 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
     return hipCUDAErrorTohipError(cerror);
 }
 
-template<class T>
-inline static hipError_t hipOccupancyMaxPotentialBlockSize(
-        int *minGridSize,
-        int *blockSize,
-        T func,
-        size_t dynamicSMemSize = 0,
-        int blockSizeLimit = 0,
-        unsigned int flags = 0
-        ){
-    cudaError_t cerror;
-    cerror =  cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
-    return hipCUDAErrorTohipError(cerror);
-}
-
 inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
         int *numBlocks,
         const void* func,
@@ -458,7 +494,6 @@ inline static hipError_t hipDriverGetVersion(int *driverVersion)
 	return hipCUDAErrorTohipError(err);
 }
 
-
 inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int  device, int  peerDevice )
 {
     return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
@@ -474,6 +509,16 @@ inline static hipError_t  hipDeviceEnablePeerAccess ( int  peerDevice, unsigned
     return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags ));
 }
 
+inline static hipError_t  hipCtxDisablePeerAccess ( hipCtx_t peerCtx )
+{
+    return hipCUResultTohipError(cuCtxDisablePeerAccess ( peerCtx ));
+}
+
+inline static hipError_t  hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int  flags )
+{
+    return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags ));
+}
+
 inline static hipError_t hipMemcpyPeer ( void* dst, int  dstDevice, const void* src, int  srcDevice, size_t count )
 {
     return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count ));
@@ -499,12 +544,145 @@ inline static hipError_t hipEventQuery(hipEvent_t event)
 	return hipCUDAErrorTohipError(cudaEventQuery(event));
 }
 
+inline static hipError_t  hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
+{
+    return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device ));
+}
+
+inline static hipError_t  hipCtxDestroy(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxDestroy ( ctx ));
+}
+
+inline static hipError_t  hipCtxPopCurrent(hipCtx_t* ctx)
+{
+    return hipCUResultTohipError(cuCtxPopCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxPushCurrent(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxPushCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxSetCurrent(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxSetCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxGetCurrent(hipCtx_t* ctx)
+{
+    return hipCUResultTohipError(cuCtxGetCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxGetDevice(hipDevice_t *device)
+{
+    return hipCUResultTohipError(cuCtxGetDevice ( device ));
+}
+
+inline static hipError_t  hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
+{
+    return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion ));
+}
+
+inline static hipError_t  hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
+{
+    return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig ));
+}
+
+inline static hipError_t  hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
+{
+    return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig ));
+}
+
+inline static hipError_t  hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
+{
+    return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config ));
+}
+
+inline static hipError_t  hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
+{
+    return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig ));
+}
+
+inline static hipError_t  hipCtxSynchronize ( void )
+{
+    return hipCUResultTohipError(cuCtxSynchronize ( ));
+}
+
+inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
+{
+    return hipCUResultTohipError(cuCtxGetFlags ( flags ));
+}
+
+inline static hipError_t hipCtxDetach(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxDetach(ctx));
+}
+
+inline static hipError_t hipDeviceGet(hipDevice_t *device, int ordinal)
+{
+    return hipCUResultTohipError(cuDeviceGet(device, ordinal));
+}
+
+inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
+{
+    return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod)
+{
+    return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
+inline static hipError_t hipModuleGetFunction(hipFunction_t *function,
+                         hipModule_t module, const char *kname)
+{
+    return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
+}
+
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
+                         hipModule_t hmod, const char* name)
+{
+    return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
+}
+
+inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
+{
+    return hipCUResultTohipError(cuModuleLoadData(module, image));
+}
+
+inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
+      unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+      unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+      unsigned int sharedMemBytes, hipStream_t stream,
+      void **kernelParams, void **extra)
+{
+    return hipCUResultTohipError(cuLaunchKernel(f, 
+                    gridDimX, gridDimY, gridDimZ,
+                    blockDimX, blockDimY, blockDimZ,
+                    sharedMemBytes, stream, kernelParams, extra));
+}
+
 #ifdef __cplusplus
 }
 #endif
 
 #ifdef __CUDACC__
 
+template<class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSize(
+        int *minGridSize,
+        int *blockSize,
+        T func,
+        size_t dynamicSMemSize = 0,
+        int blockSizeLimit = 0,
+        unsigned int flags = 0
+        ){
+    cudaError_t cerror;
+    cerror =  cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
+    return hipCUDAErrorTohipError(cerror);
+}
+
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t  hipBindTexture(size_t *offset,
 			                             const struct texture<T, dim, readMode> &tex,
diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile
new file mode 100644
index 0000000000..25ae7b8411
--- /dev/null
+++ b/samples/0_Intro/module_api/Makefile
@@ -0,0 +1,21 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
+
+ifeq (${HIP_PLATFORM}, hcc)
+	GENCODEOBJECT_FLAGS=--target-isa-fiji
+endif
+
+all: vcpy_isa.compile runKernel.hip.out
+
+vcpy_isa.compile: vcpy_isa.cpp
+	$(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co
+
+runKernel.hip.out: runKernel.cpp
+	$(HIPCC) runKernel.cpp -o runKernel.hip.out
+
+clean:
+	rm -f *.co *.out
diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp
new file mode 100644
index 0000000000..e4fa1b6d93
--- /dev/null
+++ b/samples/0_Intro/module_api/runKernel.cpp
@@ -0,0 +1,105 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__ 
+#define fileName "vcpy_isa.co"
+#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#define kernel_name "hello_world"
+#endif
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+  	hipInit(0);
+	  hipDevice_t device;
+	  hipCtx_t context;
+	  hipDeviceGet(&device, 0);
+	  hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+  uint32_t len = LEN;
+  uint32_t one = 1;
+
+    std::vector<void*>argBuffer(5);
+    uint32_t *ptr32_t = (uint32_t*)&argBuffer[0];
+    memcpy(ptr32_t + 0, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 1, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 2, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 3, &len, sizeof(uint32_t));
+    memcpy(ptr32_t + 4, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 5, &one, sizeof(uint32_t));
+    memcpy(&argBuffer[3], &Ad, sizeof(void*));
+    memcpy(&argBuffer[4], &Bd, sizeof(void*));
+
+
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); 
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=LEN-4;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+	  hipCtxDetach(context);
+#endif 
+
+    return 0;
+}
diff --git a/samples/0_Intro/module_api/vcpy_isa.cpp b/samples/0_Intro/module_api/vcpy_isa.cpp
new file mode 100644
index 0000000000..ead3253115
--- /dev/null
+++ b/samples/0_Intro/module_api/vcpy_isa.cpp
@@ -0,0 +1,9 @@
+#include<hip_runtime.h>
+
+__global__ void hello_world(hipLaunchParm lp, float *a, float *b)
+{
+    int tx = hipThreadIdx_x;
+    b[tx] = a[tx];
+}
+
+int main(){}
diff --git a/samples/0_Intro/module_api/vcpy_isa.cu b/samples/0_Intro/module_api/vcpy_isa.cu
new file mode 100644
index 0000000000..d2a0838604
--- /dev/null
+++ b/samples/0_Intro/module_api/vcpy_isa.cu
@@ -0,0 +1,6 @@
+
+extern "C" __global__ void hello_world(float *a, float *b)
+{
+	int tx = threadIdx.x;
+	b[tx] = a[tx];
+}
diff --git a/samples/0_Intro/module_api/vcpy_isa.ptx b/samples/0_Intro/module_api/vcpy_isa.ptx
new file mode 100644
index 0000000000..62eb3f63df
--- /dev/null
+++ b/samples/0_Intro/module_api/vcpy_isa.ptx
@@ -0,0 +1,38 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
+// Based on LLVM 3.4svn
+//
+
+.version 4.3
+.target sm_20
+.address_size 64
+
+	// .globl	hello_world
+
+.visible .entry hello_world(
+	.param .u64 hello_world_param_0,
+	.param .u64 hello_world_param_1
+)
+{
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<2>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [hello_world_param_0];
+	ld.param.u64 	%rd2, [hello_world_param_1];
+	cvta.to.global.u64 	%rd3, %rd2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mov.u32 	%r1, %tid.x;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f32 	%f1, [%rd6];
+	add.s64 	%rd7, %rd3, %rd5;
+	st.global.f32 	[%rd7], %f1;
+	ret;
+}
+
+
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index ee9e37a1a1..e19c45d2c3 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -29,7 +29,22 @@ THE SOFTWARE.
 // Stack of contexts
 thread_local std::stack<ihipCtx_t *>  tls_ctxStack;
 
+hipError_t ihipCtxStackUpdate()
+{
+    //HIP_INIT_API();
+    hipError_t e = hipSuccess;
 
+    if(tls_ctxStack.empty()) {
+            tls_ctxStack.push(ihipGetTlsDefaultCtx());
+    }
+
+    return ihipLogStatus(e);
+}
+
+/**
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+//---
 hipError_t hipInit(unsigned int flags)
 {
     HIP_INIT_API(flags);
@@ -44,7 +59,10 @@ hipError_t hipInit(unsigned int flags)
     return ihipLogStatus(e);
 }
 
-
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
 {
     HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init
@@ -57,7 +75,10 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
     return ihipLogStatus(e);
 }
 
-
+/**
+ * @return #hipSuccess, #hipErrorInvalidDevice
+ */
+//---
 hipError_t hipDeviceGet(hipDevice_t *device, int deviceId)
 {
     HIP_INIT_API(device, deviceId); // FIXME - review if we want to init
@@ -88,20 +109,39 @@ hipError_t hipDriverGetVersion(int *driverVersion)
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+//---
 hipError_t hipCtxDestroy(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     ihipCtx_t* currentCtx= ihipGetTlsDefaultCtx();
-    if(currentCtx == ctx) {
-        //need to destroy the ctx associated with calling thread
-        tls_ctxStack.pop();
+    ihipCtx_t* primaryCtx= ((ihipDevice_t*)ctx->getDevice())->_primaryCtx;
+    if(primaryCtx== ctx)
+    {
+        e = hipErrorInvalidValue;
     }
-    delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
+    else
+    {
+        if(currentCtx == ctx) {
+            //need to destroy the ctx associated with calling thread
+            tls_ctxStack.pop();
+        }
+        delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
+    }
+
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     ihipCtx_t* tempCtx;
     *ctx = ihipGetTlsDefaultCtx();
@@ -119,8 +159,13 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess, #hipErrorInvalidContext
+ */
+//---
 hipError_t hipCtxPushCurrent(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     if(ctx != NULL) {    //TODO- is this check needed?
         ihipSetTlsDefaultCtx(ctx);
@@ -132,19 +177,30 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
-
-    *ctx = ihipGetTlsDefaultCtx();
-    if(*ctx == nullptr) {
-        *ctx = NULL;    //TODO - is it required? Can return nullptr?
+    if(!tls_ctxStack.empty()) {
+        *ctx= tls_ctxStack.top();
+    }
+    else {
+        *ctx = NULL;
     }
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetCurrent(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     if(ctx == NULL) {
         tls_ctxStack.pop();
@@ -156,8 +212,13 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess, #hipErrorInvalidContext
+ */
+//---
 hipError_t hipCtxGetDevice(hipDevice_t *device)
 {
+    HIP_INIT_API(device);
     hipError_t e = hipSuccess;
 
     ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
@@ -171,6 +232,10 @@ hipError_t hipCtxGetDevice(hipDevice_t *device)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
 {
     HIP_INIT_API(apiVersion);
@@ -182,6 +247,10 @@ hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
 {
     HIP_INIT_API(cacheConfig);
@@ -191,6 +260,10 @@ hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
 {
     HIP_INIT_API(cacheConfig);
@@ -200,6 +273,10 @@ hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
 {
     HIP_INIT_API(config);
@@ -209,6 +286,10 @@ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
 {
     HIP_INIT_API(pConfig);
@@ -216,4 +297,28 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
     *pConfig = hipSharedMemBankSizeFourByte;
 
     return ihipLogStatus(hipSuccess);
-}
\ No newline at end of file
+}
+
+/**
+ * @return #hipSuccess
+ */
+//---
+hipError_t hipCtxSynchronize ( void )
+{
+    HIP_INIT_API(1);
+    return ihipSynchronize(); //TODP Shall check validity of ctx?
+}
+
+/**
+ * @return #hipSuccess
+ */
+//---
+hipError_t hipCtxGetFlags ( unsigned int* flags )
+{
+    HIP_INIT_API(flags);
+    hipError_t e = hipSuccess;
+    ihipCtx_t* tempCtx;
+    tempCtx = ihipGetTlsDefaultCtx();
+    *flags = tempCtx->_ctxFlags;
+    return ihipLogStatus(e);
+}
diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index e3d7fefa91..14338c11de 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -160,11 +160,8 @@ hipError_t hipSetDevice(int deviceId)
  */
 hipError_t hipDeviceSynchronize(void)
 {
-    HIP_INIT_API();
-
-    ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
-
-    return ihipLogStatus(hipSuccess);
+    HIP_INIT_API(1);
+    return ihipSynchronize();
 }
 
 
diff --git a/src/hip_event.cpp b/src/hip_event.cpp
index ca30c3c62b..4324583690 100644
--- a/src/hip_event.cpp
+++ b/src/hip_event.cpp
@@ -33,13 +33,14 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
 
     // TODO - support hipEventDefault, hipEventBlockingSync, hipEventDisableTiming
     if (flags == 0) {
-        ihipEvent_t *eh = event->_handle = new ihipEvent_t();
+        ihipEvent_t *eh = new ihipEvent_t();
 
         eh->_state  = hipEventStatusCreated;
         eh->_stream = NULL;
         eh->_flags  = flags;
         eh->_timestamp  = 0;
         eh->_copySeqId  = 0;
+        *event = eh;
     } else {
         e = hipErrorInvalidValue;
     }
@@ -71,7 +72,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
 {
     HIP_INIT_API(event, stream);
 
-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;
     if (eh && eh->_state != hipEventStatusUnitialized)   {
         eh->_stream = stream;
 
@@ -106,10 +107,10 @@ hipError_t hipEventDestroy(hipEvent_t event)
 {
     HIP_INIT_API(event);
 
-    event._handle->_state  = hipEventStatusUnitialized;
+    event->_state  = hipEventStatusUnitialized;
 
-    delete event._handle;
-    event._handle = NULL;
+    delete event;
+    event = NULL;
 
     // TODO - examine return additional error codes
     return ihipLogStatus(hipSuccess);
@@ -121,7 +122,7 @@ hipError_t hipEventSynchronize(hipEvent_t event)
 {
     HIP_INIT_API(event);
 
-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;
 
     if (eh) {
         if (eh->_state == hipEventStatusUnitialized) {
@@ -150,8 +151,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
 {
     HIP_INIT_API(ms, start, stop);
 
-    ihipEvent_t *start_eh = start._handle;
-    ihipEvent_t *stop_eh = stop._handle;
+    ihipEvent_t *start_eh = start;
+    ihipEvent_t *stop_eh = stop;
 
     ihipSetTs(start);
     ihipSetTs(stop);
@@ -195,7 +196,7 @@ hipError_t hipEventQuery(hipEvent_t event)
 {
     HIP_INIT_API(event);
 
-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;
 
     // TODO-stream - need to read state of signal here:  The event may have become ready after recording..
     // TODO-HCC - use get_hsa_signal here.
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index f0d123b64a..ea8604b7c4 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -158,7 +158,12 @@ ihipCtx_t *ihipGetTlsDefaultCtx()
     return tls_defaultCtx;
 }
 
+hipError_t ihipSynchronize(void)
+{
+    ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
 
+    return ihipLogStatus(hipSuccess);
+}
 
 //=================================================================================================
 // ihipSignal_t:
@@ -482,6 +487,53 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
 }
 
 
+void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
+                        uint32_t blockDimX,
+                        uint32_t blockDimY,
+                        uint32_t blockDimZ,
+                        uint32_t gridDimX,
+                        uint32_t gridDimY,
+                        uint32_t gridDimZ,
+                        uint32_t sharedMemBytes,
+                        void *kernarg,
+                        size_t kernSize,
+                        uint64_t kernel){
+    hsa_status_t status;
+    void *kern;
+    hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
+    status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
+    status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
+    memcpy(kern, kernarg, kernSize);
+    hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
+    const uint32_t queue_mask = Queue->size-1;
+    uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
+    hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
+
+    dispatch_packet->completion_signal = signal;
+    dispatch_packet->workgroup_size_x = blockDimX;
+    dispatch_packet->workgroup_size_y = blockDimY;
+    dispatch_packet->workgroup_size_z = blockDimZ;
+    dispatch_packet->grid_size_x = blockDimX * gridDimX;
+    dispatch_packet->grid_size_y = blockDimY * gridDimY;
+    dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
+    dispatch_packet->group_segment_size = 0;
+    dispatch_packet->private_segment_size = sharedMemBytes;
+    dispatch_packet->kernarg_address = kern;
+    dispatch_packet->kernel_object = kernel;
+    uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+    (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
+    uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+    uint32_t header32 = header | (setup << 16);
+
+    __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
+
+    hsa_queue_store_write_index_relaxed(Queue, packet_index + 1);
+    hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
+}
+
 
 //=============================================================================
 // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
@@ -1260,13 +1312,20 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
     }
 }
 
+void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) 
+{
+    std::string streamString = ToString(stream);
+    fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d %s\n" KNRM, \
+            kernelName, lp->grid_dim.x, lp->grid_dim.y, lp->grid_dim.z, lp->group_dim.x, lp->group_dim.y, lp->group_dim.z, 
+            lp->dynamic_group_mem_bytes, streamString.c_str());\
+}
 
 // TODO - data-up to data-down:
 // Called just before a kernel is launched from hipLaunchKernel.
 // Allows runtime to track some information about the stream.
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
     stream = ihipSyncAndResolveStream(stream);
 #if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid.x;
@@ -1439,7 +1498,7 @@ const char *ihipErrorString(hipError_t hip_error)
 
 void ihipSetTs(hipEvent_t e)
 {
-    ihipEvent_t *eh = e._handle;
+    ihipEvent_t *eh = e;
     if (eh->_state == hipEventStatusRecorded) {
         // already recorded, done:
         return;
@@ -1509,7 +1568,7 @@ void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType,
 }
 
 
-void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind)
+void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
 {
     ihipCtx_t *ctx = this->getCtx();
     const ihipDevice_t *device = ctx->getDevice();
@@ -1528,7 +1587,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
     bool dstInDeviceMem = dstPtrInfo._isInDeviceMem;
 
     // Resolve default to a specific Kind so we know which algorithm to use:
-    if (kind == hipMemcpyDefault) {
+    if (kind == hipMemcpyDefault && resolveOn) {
         kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem);
     };
 
@@ -1699,14 +1758,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
 
 
 // Sync copy that acquires lock:
-void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
+void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
 {
     LockedAccessor_StreamCrit_t crit (_criticalData);
-    copySync(crit, dst, src, sizeBytes, kind);
+    copySync(crit, dst, src, sizeBytes, kind, resolveOn);
 }
 
-
-
 void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
 {
     LockedAccessor_StreamCrit_t crit(_criticalData);
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index c0282dc372..a64ad94e81 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -446,12 +446,96 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
         e = ex._code;
     }
 
+    return ihipLogStatus(e);
+}
 
+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
 
     return ihipLogStatus(e);
 }
 
 
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+
+
 /**
  * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection,
  * @result #hipErrorInvalidValue : If dst==NULL or src==NULL, or other bad argument.
@@ -786,7 +870,6 @@ hipError_t hipMemGetInfo  (size_t *free, size_t *total)
             // TODO - replace with kernel-level for reporting free memory:
             size_t deviceMemSize, hostMemSize, userMemSize;
             hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
-            printf ("deviceMemSize=%zu\n", deviceMemSize);
 
             *free =  device->_props.totalGlobalMem - deviceMemSize;
         }
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 1421eb0329..e43cc62829 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -20,137 +20,188 @@ THE SOFTWARE.
 #include "hip_runtime.h"
 #include "hsa/hsa.h"
 #include "hsa/hsa_ext_amd.h"
+#include "hsa/amd_hsa_kernel_code.h"
 #include "hcc_detail/hip_hcc.h"
 #include "hcc_detail/trace_helper.h"
 #include <fstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <elf.h>
 
 //TODO Use Pool APIs from HCC to get memory regions.
 
 namespace hipdrv{
-hsa_status_t findSystemRegions(hsa_region_t region, void *data){
-  hsa_region_segment_t segment_id;
-  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
 
-  if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
-    return HSA_STATUS_SUCCESS;
-  }
+    hsa_status_t findSystemRegions(hsa_region_t region, void *data){
+        hsa_region_segment_t segment_id;
+        hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
 
-  hsa_region_global_flag_t flags;
-  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+        if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
+            return HSA_STATUS_SUCCESS;
+        }
 
-  hsa_region_t *reg = (hsa_region_t*)data;
+        hsa_region_global_flag_t flags;
+        hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
 
-  if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
-    *reg = region;
-  }
+        hsa_region_t *reg = (hsa_region_t*)data;
 
-  return HSA_STATUS_SUCCESS;
+        if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
+            *reg = region;
+        }
+
+        return HSA_STATUS_SUCCESS;
+    }
+
+}   // End namespace hipdrv
+
+uint64_t PrintSymbolSizes(const void *emi, const char *name){
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
+    if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
+    const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
+    for(uint16_t i=0;i<ehdr->e_shnum;++i){
+        if(shdr[i].sh_type == SHT_SYMTAB){
+            const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
+            assert(syms);
+            uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
+            const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
+            assert(strtab);
+            for(uint64_t i=0;i<numSyms;++i){
+                const char *symname = strtab + syms[i].st_name;
+                assert(symname);
+                uint64_t size = syms[i].st_size;
+                if(strcmp(name, symname) == 0){
+                    return size;
+                }
+            }
+        }
+    }
+    return 0;
 }
 
-hsa_status_t findKernArgRegions(hsa_region_t region, void *data){
-  hsa_region_segment_t segment_id;
-  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
+uint64_t ElfSize(const void *emi){
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
+    const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
 
-  if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
-    return HSA_STATUS_SUCCESS;
-  }
+    uint64_t max_offset = ehdr->e_shoff;
+    uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;
 
-  hsa_region_global_flag_t flags;
-  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
-
-  hsa_region_t *reg = (hsa_region_t*)data;
-
-  if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
-    *reg = region;
-  }
-
-  return HSA_STATUS_SUCCESS;
+    for(uint16_t i=0;i < ehdr->e_shnum;++i){
+        uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
+        if(max_offset < cur_offset){
+            max_offset = cur_offset;
+            total_size = max_offset;
+            if(SHT_NOBITS != shdr[i].sh_type){
+                total_size += static_cast<uint64_t>(shdr[i].sh_size);
+            }
+        }
+    }
+    return total_size;
 }
 
-
-}
-
-
-
-hipError_t hipModuleLoad(hipModule *module, const char *fname){
+hipError_t hipModuleLoad(hipModule_t *module, const char *fname){
     HIP_INIT_API(fname);
     hipError_t ret = hipSuccess;
+    *module = new ihipModule_t;
+
     if(module == NULL){
-        ret = hipErrorInvalidValue;
+        return hipErrorInvalidValue;
     }
+
     auto ctx = ihipGetTlsDefaultCtx();
     if(ctx == nullptr){
         ret = hipErrorInvalidContext;
+
     }else{
         int deviceId = ctx->getDevice()->_deviceId;
         ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
         std::ifstream in(fname, std::ios::binary | std::ios::ate);
+
         if(!in){
             return hipErrorFileNotFound;
+
         }else{
+
+            *module = new ihipModule_t;
             size_t size = std::string::size_type(in.tellg());
             void *p = NULL;
             hsa_agent_t agent = currentDevice->_hsaAgent;
             hsa_region_t sysRegion;
             hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
             status = hsa_memory_allocate(sysRegion, size, (void**)&p);
+
             if(status != HSA_STATUS_SUCCESS){
                 return hipErrorOutOfMemory;
             }
+
             char *ptr = (char*)p;
             if(!ptr){
                 return hipErrorOutOfMemory;
-                std::cout<<"Error: failed to allocate memory for code object"<<std::endl;
             }
-            hsa_code_object_t obj;
+            (*module)->ptr = p;
+            (*module)->size = size;
             in.seekg(0, std::ios::beg);
             std::copy(std::istreambuf_iterator<char>(in),
                       std::istreambuf_iterator<char>(), ptr);
-            status = hsa_code_object_deserialize(ptr, size, NULL, &obj);
+
+            status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
+
             if(status != HSA_STATUS_SUCCESS){
                 return hipErrorSharedObjectInitFailed;
             }
-            *module = obj.handle;
+
+            status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
+            if(status != HSA_STATUS_SUCCESS){
+                return hipErrorNotInitialized;
+            }
         }
     }
+
     return ret;
 }
 
-hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *name){
-    HIP_INIT_API(name);
+hipError_t hipModuleUnload(hipModule_t hmod){
+    hipError_t ret = hipSuccess;
+    hsa_status_t status = hsa_executable_destroy(hmod->executable);
+    if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
+    status = hsa_code_object_destroy(hmod->object);
+    if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
+    delete hmod;
+    return ret;
+}
+
+hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const char *name){
     auto ctx = ihipGetTlsDefaultCtx();
     hipError_t ret = hipSuccess;
-    if(name == nullptr || hmod == 0){
+
+    if(name == nullptr){
         return hipErrorInvalidValue;
     }
+
     if(ctx == nullptr){
         ret = hipErrorInvalidContext;
+
     }else{
+        *func = new ihipFunction_t;
         int deviceId = ctx->getDevice()->_deviceId;
         ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
         hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
 
         hsa_status_t status;
-        hsa_executable_symbol_t kernel_symbol;
-        hsa_executable_t executable;
-        status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
+        status = hsa_executable_load_code_object(hmod->executable, gpuAgent, hmod->object, NULL);
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotInitialized;
         }
-        hsa_code_object_t obj;
-        obj.handle = hmod;
-        status = hsa_executable_load_code_object(executable, gpuAgent, obj, NULL);
-        if(status != HSA_STATUS_SUCCESS){
-            return hipErrorNotInitialized;
-        }
-        status = hsa_executable_freeze(executable, NULL);
-        status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
+
+        status = hsa_executable_freeze(hmod->executable, NULL);
+        status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &(*func)->kernel_symbol);
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotFound;
         }
-        status = hsa_executable_symbol_get_info(kernel_symbol,
+
+        status = hsa_executable_symbol_get_info((*func)->kernel_symbol,
                                    HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
-                                   func);
+                                   &(*func)->kernel);
+
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotFound;
         }
@@ -158,16 +209,24 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n
     return ret;
 }
 
-hipError_t hipLaunchModuleKernel(hipFunction f,
+hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
+                                const char *name){
+    HIP_INIT_API(name);
+    return ihipModuleGetFunction(hfunc, hmod, name);
+}
+
+hipError_t hipModuleLaunchKernel(hipFunction_t f,
             uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
             uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
             uint32_t sharedMemBytes, hipStream_t hStream,
             void **kernelParams, void **extra){
-    HIP_INIT_API(f);
+    HIP_INIT_API(f->kernel);
     auto ctx = ihipGetTlsDefaultCtx();
     hipError_t ret = hipSuccess;
+
     if(ctx == nullptr){
         ret = hipErrorInvalidDevice;
+
     }else{
         int deviceId = ctx->getDevice()->_deviceId;
         ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
@@ -189,66 +248,100 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
 /*
 Kernel argument preparation.
 */
-
-        hsa_region_t kernArg;
-        hsa_status_t status = hsa_agent_iterate_regions(gpuAgent, hipdrv::findKernArgRegions, &kernArg);
-        void *kern;
-        status = hsa_memory_allocate(kernArg, kernSize, &kern);
-        if(status != HSA_STATUS_SUCCESS){
-            return hipErrorLaunchOutOfResources;
-        }
-        memcpy(kern, config[1], kernSize);
-
+        hsa_status_t status;
+        grid_launch_parm lp;
+        hStream = ihipPreLaunchKernel(hStream, 0, 0, &lp);
 
 /*
-Pre kernel launch 
-
-    stream = ihipSyncAndResolveStream(stream);
-    stream->lockopen_preKernelCommand();
-    hc::accelerator_view av = &stream->_av;
-    hc::completion_future cf = new hc::completion_future;
+  Create signal
 */
 
-        hStream = ihipSyncAndResolveStream(hStream);
-        hc::accelerator_view *av = &hStream->_av;
-        hsa_queue_t *Queue = (hsa_queue_t*)av->get_hsa_queue();
         hsa_signal_t signal;
         status = hsa_signal_create(1, 0, NULL, &signal);
 
 /*
-Creating the packets
+  Launch AQL packet
+*/
+        hStream->launchModuleKernel(signal, blockDimX, blockDimY, blockDimZ,
+                  gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f->kernel);
+
+/*
+  Wait for signal
 */
 
-        const uint32_t queue_mask = Queue->size-1;
-        uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
-        hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
-
-        dispatch_packet->completion_signal = signal;
-        dispatch_packet->workgroup_size_x = blockDimX;
-        dispatch_packet->workgroup_size_y = blockDimY;
-        dispatch_packet->workgroup_size_z = blockDimZ;
-        dispatch_packet->grid_size_x = blockDimX * gridDimX;
-        dispatch_packet->grid_size_y = blockDimY * gridDimY;
-        dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
-
-        dispatch_packet->group_segment_size = 0;
-        dispatch_packet->private_segment_size = sharedMemBytes;
-        dispatch_packet->kernarg_address = kern;
-        dispatch_packet->kernel_object = f;
-        uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-        (1 << HSA_PACKET_HEADER_BARRIER) |
-        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-
-        uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-        uint32_t header32 = header | (setup << 16);
-
-        __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
-
-        hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
-        hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
         hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+
+
+        ihipPostLaunchKernel(hStream, lp);
+                
+    }
+
+    return ret;
+}
+
+
+hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
+                              hipModule_t hmod, const char* name){
+    HIP_INIT_API(name);
+    hipError_t ret = hipSuccess;
+    if(dptr == NULL || bytes == NULL){
+        return hipErrorInvalidValue;
+    }
+    if(name == NULL || hmod == NULL){
+        return hipErrorNotInitialized;
+    }
+    else{
+        hipFunction_t func;
+        ihipModuleGetFunction(&func, hmod, name);
+        *bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t);
+        *dptr = reinterpret_cast<void*>(func->kernel);
+        return ret;
+    }
+}
+
+hipError_t hipModuleLoadData(hipModule_t *module, const void *image){
+    HIP_INIT_API(image);
+    hipError_t ret = hipSuccess;
+    if(image == NULL || module == NULL){
+        return hipErrorNotInitialized;
+    }else{
+        auto ctx = ihipGetTlsDefaultCtx();
+        *module = new ihipModule_t;
+        int deviceId = ctx->getDevice()->_deviceId;
+        ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
+
+        void *p;
+        uint64_t size = ElfSize(image);
+        hsa_agent_t agent = currentDevice->_hsaAgent;
+        hsa_region_t sysRegion;
+        hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
+        status = hsa_memory_allocate(sysRegion, size, (void**)&p);
+
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorOutOfMemory;
+        }
+
+        char *ptr = (char*)p;
+        if(!ptr){
+           return hipErrorOutOfMemory;
+        }
+        (*module)->ptr = p;
+        (*module)->size = size;
+
+        memcpy(ptr, image, size);
+
+        status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
+
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorSharedObjectInitFailed;
+        }
+
+        status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorNotInitialized;
+        }
     }
     return ret;
 }
+
+
diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp
index fd51599815..8f75d99918 100644
--- a/src/hip_peer.cpp
+++ b/src/hip_peer.cpp
@@ -67,7 +67,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_
 //---
 // Disable visibility of this device into memory allocated on peer device.
 // Remove this device from peer device peerlist.
-hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
+hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx)
 {
     HIP_INIT_API(peerCtx);
 
@@ -109,7 +109,7 @@ hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
 //---
 // Allow the current device to see all memory allocated on peerDevice.
 // This should add this device to the peer-device peer list.
-hipError_t hipDeviceEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
+hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
 {
     HIP_INIT_API(peerCtx, flags);
 
@@ -175,7 +175,7 @@ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId)
 {
     HIP_INIT_API(peerDeviceId);
 
-    return hipDeviceDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
+    return ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
 }
 
 
@@ -183,7 +183,7 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags)
 {
     HIP_INIT_API(peerDeviceId, flags);
 
-    return hipDeviceEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
+    return ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
 }
 
 
@@ -200,6 +200,16 @@ hipError_t hipMemcpyPeerAsync (void* dst, int  dstDevice, const void* src, int
     return hipMemcpyPeerAsync(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes, stream);
 }
 
+hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
+{
+    HIP_INIT_API(peerCtx, flags);
 
+    return ihipEnablePeerAccess(peerCtx, flags);
+}
 
+hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx)
+{
+    HIP_INIT_API(peerCtx);
 
+    return ihipDisablePeerAccess(peerCtx);
+}
diff --git a/tests/src/hipComplex.cpp b/tests/src/hipComplex.cpp
new file mode 100644
index 0000000000..2e9c1884a1
--- /dev/null
+++ b/tests/src/hipComplex.cpp
@@ -0,0 +1,53 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+#include<hip/hcc_detail/hipComplex.h>
+
+#define LEN 64
+#define SIZE 64<<2
+
+__global__  void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
+}
+
+int main(){
+    float *A, *Ad, *B, *Bd, *C, *Cd;
+    A = new float[LEN];
+    B = new float[LEN];
+    C = new float[LEN];
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = i*1.0f;
+        C[i] = i*1.0f;
+    }
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+    hipMalloc((void**)&Cd, SIZE);
+    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
+    hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
+    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
+    std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;
+}
diff --git a/tests/src/hipDeviceMemcpy.cpp b/tests/src/hipDeviceMemcpy.cpp
new file mode 100644
index 0000000000..7c1cc59808
--- /dev/null
+++ b/tests/src/hipDeviceMemcpy.cpp
@@ -0,0 +1,42 @@
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+
+#define LEN 1030
+#define SIZE LEN << 2
+
+__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
+{
+    memcpy(Out, In, SIZE, Vald);
+}
+
+__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
+{
+    memset(ptr, val, size);
+}
+
+int main()
+{
+    uint32_t *A, *Ad, *B, *Bd;
+    uint32_t *Val, *Vald;
+    A = new uint32_t[LEN];
+    B = new uint32_t[LEN];
+    Val = new uint32_t;
+    *Val = 0;
+    for(int i=0;i<LEN;i++){
+        A[i] = i *1.0f;
+        B[i] = 0.0f;
+    }
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+    hipMalloc((void**)&Vald, sizeof(uint32_t));
+    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+    hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
+    hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
+    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+    hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
+    for(int i=LEN-16;i<LEN;i++){
+        std::cout<<A[i]<<" "<<B[i]<<std::endl;
+    }
+    std::cout<<*Val<<std::endl;
+}
diff --git a/tests/src/hipDrvMemcpy.cpp b/tests/src/hipDrvMemcpy.cpp
new file mode 100644
index 0000000000..55c9b18818
--- /dev/null
+++ b/tests/src/hipDrvMemcpy.cpp
@@ -0,0 +1,26 @@
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+
+#define LEN 1024
+#define SIZE LEN<<2
+
+int main(){
+  int *A, *B, *C;
+  hipDeviceptr Ad, Bd;
+  A = new int[LEN];
+  B = new int[LEN];
+  C = new int[LEN];
+  for(int i=0;i<LEN;i++){
+    A[i] = i;
+  }
+  hipMalloc((void**)&Ad, SIZE);
+  hipMalloc((void**)&Bd, SIZE);
+  hipMemcpyHtoD(Ad, A, SIZE);
+  hipMemcpyDtoD(Bd, Ad, SIZE);
+  hipMemcpyDtoH(B, Bd, SIZE);
+  hipMemcpyHtoH(C, B,SIZE);
+  for(int i=0;i<16;i++){
+    std::cout<<A[i]<<" "<<C[i]<<std::endl;
+  }
+}
diff --git a/tests/src/hipModule.cpp b/tests/src/hipModule.cpp
new file mode 100644
index 0000000000..86ccb3ab9f
--- /dev/null
+++ b/tests/src/hipModule.cpp
@@ -0,0 +1,85 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#define fileName "vcpy_isa.co"
+#define kernel_name "hello_world"
+
+__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
+  int tx = hipThreadIdx_x;
+  Bd[tx] = Ad[tx];
+}
+
+int main(){
+  float *A, *B, *Ad, *Bd;
+  A = new float[LEN];
+  B = new float[LEN];
+
+  for(uint32_t i=0;i<LEN;i++){
+    A[i] = i*1.0f;
+    B[i] = 0.0f;
+    std::cout<<A[i] << " "<<B[i]<<std::endl;
+  }
+
+  hipMalloc((void**)&Ad, SIZE);
+  hipMalloc((void**)&Bd, SIZE);
+
+  hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+  hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
+  hipModule_t Module;
+  hipFunction_t Function;
+  hipModuleLoad(&Module, fileName);
+  hipModuleGetFunction(&Function, Module, kernel_name);
+  hipStream_t stream;
+  hipStreamCreate(&stream);
+  void *args[2] = {&Ad, &Bd};
+
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config); 
+
+    hipStreamDestroy(stream);
+
+ hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+
+  for(uint32_t i=0;i<LEN;i++){
+  std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+  }
+
+  return 0;
+}
diff --git a/tests/src/hipModuleUnload.cpp b/tests/src/hipModuleUnload.cpp
new file mode 100644
index 0000000000..2e37414c2c
--- /dev/null
+++ b/tests/src/hipModuleUnload.cpp
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+#include<iostream>
+
+#define fileName "vcpy_isa.co"
+
+int main(){
+    hipModule_t module;
+    hipModuleLoad(&module, fileName);
+    hipModuleUnload(module);
+}
+
diff --git a/tests/src/sampleModule.cpp b/tests/src/sampleModule.cpp
new file mode 100644
index 0000000000..606b19717d
--- /dev/null
+++ b/tests/src/sampleModule.cpp
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__ 
+#define fileName "vcpy_isa.co"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#endif
+
+#define kernel_name "hello_world"
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+        std::cout<<A[i] << " "<<B[i]<<std::endl;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+  	hipInit(0);
+	  hipDevice_t device;
+	  hipCtx_t context;
+	  hipDeviceGet(&device, 0);
+	  hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); 
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=0;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+	  hipCtxDetach(context);
+#endif 
+
+    return 0;
+}
diff --git a/tests/src/vcpy_isa.co b/tests/src/vcpy_isa.co
new file mode 100755
index 0000000000..96105ee74b
Binary files /dev/null and b/tests/src/vcpy_isa.co differ
diff --git a/tests/src/vcpy_isa.cu b/tests/src/vcpy_isa.cu
new file mode 100644
index 0000000000..d2a0838604
--- /dev/null
+++ b/tests/src/vcpy_isa.cu
@@ -0,0 +1,6 @@
+
+extern "C" __global__ void hello_world(float *a, float *b)
+{
+	int tx = threadIdx.x;
+	b[tx] = a[tx];
+}
diff --git a/tests/src/vcpy_isa.ptx b/tests/src/vcpy_isa.ptx
new file mode 100644
index 0000000000..62eb3f63df
--- /dev/null
+++ b/tests/src/vcpy_isa.ptx
@@ -0,0 +1,38 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
+// Based on LLVM 3.4svn
+//
+
+.version 4.3
+.target sm_20
+.address_size 64
+
+	// .globl	hello_world
+
+.visible .entry hello_world(
+	.param .u64 hello_world_param_0,
+	.param .u64 hello_world_param_1
+)
+{
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<2>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [hello_world_param_0];
+	ld.param.u64 	%rd2, [hello_world_param_1];
+	cvta.to.global.u64 	%rd3, %rd2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mov.u32 	%r1, %tid.x;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f32 	%f1, [%rd6];
+	add.s64 	%rd7, %rd3, %rd5;
+	st.global.f32 	[%rd7], %f1;
+	ret;
+}
+
+