From 921736782e879c75d07bd36b5c8d17a69258f216 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 19 Aug 2016 08:49:34 -0500
Subject: [PATCH 01/56] Added support for executable and symbols for data
 structures - symbol handle is added to hipFunction - executable handle is
 added to hipModule - This way, the APIs doesn't need to track the values

Change-Id: I7cf05329cf79fe946319d7746bd9f5503268fda4
---
 include/hcc_detail/hip_runtime_api.h |  10 ++-
 src/hip_module.cpp                   | 120 ++++++++++++++++++---------
 2 files changed, 87 insertions(+), 43 deletions(-)
diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 0316f73b57..9917619039 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -52,9 +52,15 @@ typedef struct ihipDevice_t *hipDevice_t;
 
 typedef struct ihipStream_t *hipStream_t;
 
-typedef uint64_t hipFunction;
+typedef struct {
+uint64_t kernel;
+uint64_t symbol;
+}hipFunction;
 
-typedef uint64_t hipModule;
+typedef struct{
+uint64_t executable;
+uint64_t object;
+} hipModule;
 
 typedef struct hipEvent_t {
     struct ihipEvent_t *_handle;
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 1421eb0329..48dc6574a0 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -20,6 +20,7 @@ THE SOFTWARE.
 #include "hip_runtime.h"
 #include "hsa/hsa.h"
 #include "hsa/hsa_ext_amd.h"
+#include "hsa/amd_hsa_kernel_code.h"
 #include "hcc_detail/hip_hcc.h"
 #include "hcc_detail/trace_helper.h"
 #include <fstream>
@@ -27,66 +28,71 @@ THE SOFTWARE.
 //TODO Use Pool APIs from HCC to get memory regions.
 
 namespace hipdrv{
-hsa_status_t findSystemRegions(hsa_region_t region, void *data){
-  hsa_region_segment_t segment_id;
-  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
 
-  if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
-    return HSA_STATUS_SUCCESS;
-  }
+    hsa_status_t findSystemRegions(hsa_region_t region, void *data){
+        hsa_region_segment_t segment_id;
+        hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
 
-  hsa_region_global_flag_t flags;
-  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+        if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
+            return HSA_STATUS_SUCCESS;
+        }
 
-  hsa_region_t *reg = (hsa_region_t*)data;
+        hsa_region_global_flag_t flags;
+        hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
 
-  if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
-    *reg = region;
-  }
+        hsa_region_t *reg = (hsa_region_t*)data;
 
-  return HSA_STATUS_SUCCESS;
-}
+        if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
+            *reg = region;
+        }
 
-hsa_status_t findKernArgRegions(hsa_region_t region, void *data){
-  hsa_region_segment_t segment_id;
-  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
+        return HSA_STATUS_SUCCESS;
+    }
 
-  if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
-    return HSA_STATUS_SUCCESS;
-  }
+    hsa_status_t findKernArgRegions(hsa_region_t region, void *data){
+        hsa_region_segment_t segment_id;
+        hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
 
-  hsa_region_global_flag_t flags;
-  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+        if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
+            return HSA_STATUS_SUCCESS;
+        }
 
-  hsa_region_t *reg = (hsa_region_t*)data;
+        hsa_region_global_flag_t flags;
+        hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
 
-  if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
-    *reg = region;
-  }
+        hsa_region_t *reg = (hsa_region_t*)data;
 
-  return HSA_STATUS_SUCCESS;
-}
+        if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
+            *reg = region;
+        }
 
+        return HSA_STATUS_SUCCESS;
+    }
 
-}
+}   // End namespace hipdrv
 
 
 
 hipError_t hipModuleLoad(hipModule *module, const char *fname){
     HIP_INIT_API(fname);
     hipError_t ret = hipSuccess;
+
     if(module == NULL){
-        ret = hipErrorInvalidValue;
+        return hipErrorInvalidValue;
     }
+
     auto ctx = ihipGetTlsDefaultCtx();
     if(ctx == nullptr){
         ret = hipErrorInvalidContext;
+
     }else{
         int deviceId = ctx->getDevice()->_deviceId;
         ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
         std::ifstream in(fname, std::ios::binary | std::ios::ate);
+
         if(!in){
             return hipErrorFileNotFound;
+
         }else{
             size_t size = std::string::size_type(in.tellg());
             void *p = NULL;
@@ -97,22 +103,35 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
             if(status != HSA_STATUS_SUCCESS){
                 return hipErrorOutOfMemory;
             }
+
             char *ptr = (char*)p;
             if(!ptr){
                 return hipErrorOutOfMemory;
-                std::cout<<"Error: failed to allocate memory for code object"<<std::endl;
             }
+
             hsa_code_object_t obj;
             in.seekg(0, std::ios::beg);
             std::copy(std::istreambuf_iterator<char>(in),
                       std::istreambuf_iterator<char>(), ptr);
             status = hsa_code_object_deserialize(ptr, size, NULL, &obj);
+
             if(status != HSA_STATUS_SUCCESS){
                 return hipErrorSharedObjectInitFailed;
             }
-            *module = obj.handle;
+
+            module->object = obj.handle;
+
+            hsa_executable_t executable;
+            status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
+            if(status != HSA_STATUS_SUCCESS){
+                return hipErrorNotInitialized;
+            }
+
+            module->executable = executable.handle;
+
         }
     }
+
     return ret;
 }
 
@@ -120,41 +139,52 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n
     HIP_INIT_API(name);
     auto ctx = ihipGetTlsDefaultCtx();
     hipError_t ret = hipSuccess;
-    if(name == nullptr || hmod == 0){
+
+    if(name == nullptr){
         return hipErrorInvalidValue;
     }
+
     if(ctx == nullptr){
         ret = hipErrorInvalidContext;
+
     }else{
         int deviceId = ctx->getDevice()->_deviceId;
         ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
         hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
 
         hsa_status_t status;
-        hsa_executable_symbol_t kernel_symbol;
         hsa_executable_t executable;
-        status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
+        hsa_executable_symbol_t kernel_symbol;
+        executable.handle = hmod.executable;
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotInitialized;
         }
+
         hsa_code_object_t obj;
-        obj.handle = hmod;
+        obj.handle = hmod.object;
         status = hsa_executable_load_code_object(executable, gpuAgent, obj, NULL);
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotInitialized;
         }
+
         status = hsa_executable_freeze(executable, NULL);
         status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotFound;
         }
+
         status = hsa_executable_symbol_get_info(kernel_symbol,
                                    HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
-                                   func);
+                                   &func->kernel);
+
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotFound;
         }
+
+        func->symbol = kernel_symbol.handle;
+
     }
+
     return ret;
 }
 
@@ -163,11 +193,13 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
             uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
             uint32_t sharedMemBytes, hipStream_t hStream,
             void **kernelParams, void **extra){
-    HIP_INIT_API(f);
+    HIP_INIT_API(f.kernel);
     auto ctx = ihipGetTlsDefaultCtx();
     hipError_t ret = hipSuccess;
+
     if(ctx == nullptr){
         ret = hipErrorInvalidDevice;
+
     }else{
         int deviceId = ctx->getDevice()->_deviceId;
         ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
@@ -194,12 +226,15 @@ Kernel argument preparation.
         hsa_status_t status = hsa_agent_iterate_regions(gpuAgent, hipdrv::findKernArgRegions, &kernArg);
         void *kern;
         status = hsa_memory_allocate(kernArg, kernSize, &kern);
+
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorLaunchOutOfResources;
         }
+
         memcpy(kern, config[1], kernSize);
 
 
+
 /*
 Pre kernel launch 
 
@@ -215,6 +250,10 @@ Pre kernel launch
         hsa_signal_t signal;
         status = hsa_signal_create(1, 0, NULL, &signal);
 
+//        hsa_memory_pool_t pool = av->get_hsa_kernarg_region();
+//        status = hsa_amd_memory_pool_allocate(pool, kernSize, 0, &kernArg);
+//        assert(status == HSA_STATUS_SUCCESS);
+
 /*
 Creating the packets
 */
@@ -230,17 +269,15 @@ Creating the packets
         dispatch_packet->grid_size_x = blockDimX * gridDimX;
         dispatch_packet->grid_size_y = blockDimY * gridDimY;
         dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
-
         dispatch_packet->group_segment_size = 0;
         dispatch_packet->private_segment_size = sharedMemBytes;
         dispatch_packet->kernarg_address = kern;
-        dispatch_packet->kernel_object = f;
+        dispatch_packet->kernel_object = f.kernel;
         uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
         (1 << HSA_PACKET_HEADER_BARRIER) |
         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
 
-
         uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
         uint32_t header32 = header | (setup << 16);
 
@@ -250,5 +287,6 @@ Creating the packets
         hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
         hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
     }
+
     return ret;
 }

From 78b15bf062073bfc8e152cadc6cf267e4071608a Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 19 Aug 2016 21:48:23 -0500
Subject: [PATCH 02/56] Added support for complex device functions - Added
 complex number arithmetic operation for float and double datatypes - TODO:
 make them host functions and support half - Added new function which is not
 in CUDA, hipCsqabs which is square of absolute value

Change-Id: Ib96e194ad45dc64fcba29eb19ad0376542e0591d
---
 include/hcc_detail/hipComplex.h | 116 ++++++++++++++++++++++++++++++++
 tests/src/hipComplex.cpp        |  33 +++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 include/hcc_detail/hipComplex.h
 create mode 100644 tests/src/hipComplex.cpp

diff --git a/include/hcc_detail/hipComplex.h b/include/hcc_detail/hipComplex.h
new file mode 100644
index 0000000000..c5584047bb
--- /dev/null
+++ b/include/hcc_detail/hipComplex.h
@@ -0,0 +1,116 @@
+#ifndef HIPCOMPLEX_H
+#define HIPCOMPLEX_H
+
+
+typedef struct{
+    float x;
+    float y;
+}hipFloatComplex;
+
+__device__ inline float hipCrealf(hipFloatComplex z){
+    return z.x;
+}
+
+__device__ inline float hipCimagf(hipFloatComplex z){
+    return z.y;
+}
+
+__device__ inline hipFloatComplex make_hipFloatComplex(float a, float b){
+    hipFloatComplex z;
+    z.x = a;
+    z.y = b;
+    return z;
+}
+
+__device__ inline hipFloatComplex hipConjf(hipFloatComplex z){
+    hipFloatComplex ret;
+    ret.x = z.x;
+    ret.y = z.y;
+    return ret;
+}
+
+__device__ inline float hipCsqabsf(hipFloatComplex z){
+    return z.x * z.x + z.y * z.y;
+}
+
+__device__ hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+    float sqabs = hipCsqabsf(q);
+    hipFloatComplex ret;
+    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
+    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    return ret;
+}
+
+__device__ float hipCabsf(hipFloatComplex z){
+    return sqrtf(hipCsqabsf(z));
+}
+
+
+typedef struct{
+    double x;
+    double y;
+}hipDoubleComplex;
+
+__device__ inline double hipCreal(hipDoubleComplex z){
+    return z.x;
+}
+
+__device__ inline double hipCimag(hipDoubleComplex z){
+    return z.y;
+}
+
+__device__ inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+    hipDoubleComplex z;
+    z.x = a;
+    z.y = b;
+    return z;
+}
+
+__device__ inline hipDoubleComplex hipConj(hipDoubleComplex z){
+    hipDoubleComplex ret;
+    ret.x = z.x;
+    ret.y = z.y;
+    return ret;
+}
+
+__device__ inline double hipCsqabs(hipDoubleComplex z){
+    return z.x * z.x + z.y * z.y;
+}
+
+__device__ hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+    double sqabs = hipCsqabs(q);
+    hipDoubleComplex ret;
+    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
+    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    return ret;
+}
+
+__device__ inline double hipCabs(hipDoubleComplex z){
+    return sqrtf(hipCsqabs(z));
+}
+
+#endif
diff --git a/tests/src/hipComplex.cpp b/tests/src/hipComplex.cpp
new file mode 100644
index 0000000000..bfa1a0be15
--- /dev/null
+++ b/tests/src/hipComplex.cpp
@@ -0,0 +1,33 @@
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+#include<hip/hcc_detail/hipComplex.h>
+
+#define LEN 64
+#define SIZE 64<<2
+
+__global__  void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
+}
+
+int main(){
+    float *A, *Ad, *B, *Bd, *C, *Cd;
+    A = new float[LEN];
+    B = new float[LEN];
+    C = new float[LEN];
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = i*1.0f;
+        C[i] = i*1.0f;
+    }
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+    hipMalloc((void**)&Cd, SIZE);
+    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
+    hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
+    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
+    std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;
+}

From 24f6251b997c70d4e499a95f2168c047438fac97 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 19 Aug 2016 23:02:04 -0500
Subject: [PATCH 03/56] Added more complex apis and copyright - New header
 which redirects to CUDA/HIP path added for hipComplex.h - Added more complex
 device api including fma - Added copyright to new files

Change-Id: Iff0dece4c438e97d0ae33efa4312975d465a6464
---
 include/hcc_detail/hipComplex.h | 60 ++++++++++++++++++++++++++++++++-
 include/hipComplex.h            | 31 +++++++++++++++++
 tests/src/hipComplex.cpp        | 20 +++++++++++
 3 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 include/hipComplex.h

diff --git a/include/hcc_detail/hipComplex.h b/include/hcc_detail/hipComplex.h
index c5584047bb..66d14d191e 100644
--- a/include/hcc_detail/hipComplex.h
+++ b/include/hcc_detail/hipComplex.h
@@ -1,7 +1,26 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
 #ifndef HIPCOMPLEX_H
 #define HIPCOMPLEX_H
 
-
 typedef struct{
     float x;
     float y;
@@ -113,4 +132,43 @@ __device__ inline double hipCabs(hipDoubleComplex z){
     return sqrtf(hipCsqabs(z));
 }
 
+typedef hipFloatComplex hipComplex;
+
+__device__ inline hipComplex make_hipComplex(float x,
+                                            float y){
+    return make_hipFloatComplex(x, y);
+}
+
+__device__ inline hipFloatComplex hipComplexDoubleToFloat
+(hipDoubleComplex z){
+    return make_hipFloatComplex((float)z.x, (float)z.y);
+}
+
+__device__ inline hipDoubleComplex hipComplexFloatToDouble
+(hipFloatComplex z){
+    return make_hipDoubleComplex((double)z.x, (double)z.y);
+}
+
+__device__ inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+    float real = (p.x * q.x) + r.x;
+    float imag = (q.x * p.y) + r.y;
+
+    real = -(p.y * q.y) + real;
+    imag = (p.x * q.y) + imag;
+
+    return make_hipComplex(real, imag);
+}
+
+__device__ inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
+    float real = (p.x * q.x) + r.x;
+    float imag = (q.x * p.y) + r.y;
+
+    real = -(p.y * q.y) + real;
+    imag = (p.x * q.y) + imag;
+
+    return make_hipDoubleComplex(real, imag);
+}
+
+
+
 #endif
diff --git a/include/hipComplex.h b/include/hipComplex.h
new file mode 100644
index 0000000000..0fdafb8263
--- /dev/null
+++ b/include/hipComplex.h
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/hipComplex.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#include "cuComplex.h"
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
diff --git a/tests/src/hipComplex.cpp b/tests/src/hipComplex.cpp
index bfa1a0be15..2e9c1884a1 100644
--- a/tests/src/hipComplex.cpp
+++ b/tests/src/hipComplex.cpp
@@ -1,3 +1,23 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
 #include<iostream>
 #include<hip/hip_runtime.h>
 #include<hip/hip_runtime_api.h>

From 4b5b15a8e529a15d23920fc211e20310cec623ac Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Mon, 22 Aug 2016 11:37:37 +0530
Subject: [PATCH 04/56] Added initial draft for performance optimizations,
 started with unpinned memory transfers

Change-Id: Icbce2aec347d015bc66cc0c08f6193057bf36b4c
---
 docs/markdown/hip_performance.md | 42 ++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 docs/markdown/hip_performance.md

diff --git a/docs/markdown/hip_performance.md b/docs/markdown/hip_performance.md
new file mode 100644
index 0000000000..98197b3db7
--- /dev/null
+++ b/docs/markdown/hip_performance.md
@@ -0,0 +1,42 @@
+# HIP Performance Optimizations
+
+Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
+
+### Unpinned Memory Transfer Optimizations
+ 
+#### On Small BAR Setup
+
+There are two possible ways to transfer data from Host to Device (H2D) and Device to Host(D2H)
+ * Using Staging Buffers
+ * Using PinInPlace
+
+#### On Large BAR Setup
+
+There are two possible ways to transfer data from Host to Device (H2D)
+ * Using Staging Buffers
+ * Using PinInPlace
+ * Direct Memcpy
+ 
+ And there are two possible ways to transfer data from Device to Host (D2H)
+ * Using Staging Buffers
+ * Using PinInPlace
+ 
+Some GPUs may not be able to directly access host memory, and in these cases we need to
+stage the copy through an optimized pinned staging buffer, to implement H2D and D2H copies.The copy is broken into buffer-sized chunks to limit the size of the buffer and also to provide better performance by overlapping the CPU copies with the DMA copies.
+
+PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA
+engine.
+
+By default staging buffers are used for unpinned memory transfers, however other ways can be used by enabling few environment variables (so no need to build the code again!!!)
+
+Following environment variables can be used:
+
+- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
+
+- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
+  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
+  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
+  -   HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
+
+
+

From a498753041593ffbebcaf20ccf69006e7fc29069 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Mon, 22 Aug 2016 16:15:27 +0530
Subject: [PATCH 05/56] Added support for hipCtxSynchronize and
 hipCtxGetFlags,modified hipDeviceSynchronize

Change-Id: If7bac667a262fa8c0cb3dc93e97f2534855acd07
---
 include/hcc_detail/hip_hcc.h         |  1 +
 include/hcc_detail/hip_runtime_api.h |  5 +++++
 src/hip_context.cpp                  | 16 +++++++++++++++-
 src/hip_device.cpp                   |  6 +-----
 src/hip_hcc.cpp                      |  7 +++++++
 5 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index 2f0c33a0d2..de6613184f 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -648,6 +648,7 @@ extern void ihipInit();
 extern const char *ihipErrorString(hipError_t);
 extern ihipCtx_t    *ihipGetTlsDefaultCtx();
 extern void          ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
+extern hipError_t    ihipSynchronize(void);
 
 extern ihipDevice_t *ihipGetDevice(int);
 ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 9917619039..094549a3a1 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1086,6 +1086,11 @@ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config );
 
 hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig );
 
+hipError_t hipCtxSynchronize ( void );
+
+hipError_t hipCtxGetFlags ( unsigned int* flags );
+
+
 // TODO-ctx
 /**
  * @return hipSuccess, hipErrorInvalidDevice
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index ee9e37a1a1..33caf610bc 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -216,4 +216,18 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
     *pConfig = hipSharedMemBankSizeFourByte;
 
     return ihipLogStatus(hipSuccess);
-}
\ No newline at end of file
+}
+
+hipError_t hipCtxSynchronize ( void )
+{
+    return ihipSynchronize(); //TODP Shall check validity of ctx?
+}
+
+hipError_t hipCtxGetFlags ( unsigned int* flags )
+{
+    hipError_t e = hipSuccess;
+    ihipCtx_t* tempCtx;
+    tempCtx = ihipGetTlsDefaultCtx();
+    *flags = tempCtx->_ctxFlags;
+    return ihipLogStatus(e);
+}
diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index e3d7fefa91..d31e3b5890 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -160,11 +160,7 @@ hipError_t hipSetDevice(int deviceId)
  */
 hipError_t hipDeviceSynchronize(void)
 {
-    HIP_INIT_API();
-
-    ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
-
-    return ihipLogStatus(hipSuccess);
+    return ihipSynchronize();
 }
 
 
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index f0d123b64a..e6b2905799 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -158,7 +158,14 @@ ihipCtx_t *ihipGetTlsDefaultCtx()
     return tls_defaultCtx;
 }
 
+hipError_t ihipSynchronize(void)
+{
+    HIP_INIT_API();
 
+    ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
+
+    return ihipLogStatus(hipSuccess);
+}
 
 //=================================================================================================
 // ihipSignal_t:

From 0806958a72abaefb52a1015257a3fcc4bea8b46a Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 22 Aug 2016 10:29:46 -0500
Subject: [PATCH 06/56] Added nvcc path for hipComplex APIs - Changed from
 inline to static inline for hipComplex AMD APIs - Added NVCC path for
 hipComplex APIs mapped to cuComplex APIs

Change-Id: I809cf3a11b5b1c8bbc7a57c5fbcc3dc6745ccb95
---
 include/hcc_detail/hipComplex.h  |  54 ++++++++--------
 include/hipComplex.h             |   2 +-
 include/nvcc_detail/hipComplex.h | 108 +++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 29 deletions(-)
 create mode 100644 include/nvcc_detail/hipComplex.h

diff --git a/include/hcc_detail/hipComplex.h b/include/hcc_detail/hipComplex.h
index 66d14d191e..910cee946d 100644
--- a/include/hcc_detail/hipComplex.h
+++ b/include/hcc_detail/hipComplex.h
@@ -26,45 +26,45 @@ typedef struct{
     float y;
 }hipFloatComplex;
 
-__device__ inline float hipCrealf(hipFloatComplex z){
+__device__ static inline float hipCrealf(hipFloatComplex z){
     return z.x;
 }
 
-__device__ inline float hipCimagf(hipFloatComplex z){
+__device__ static inline float hipCimagf(hipFloatComplex z){
     return z.y;
 }
 
-__device__ inline hipFloatComplex make_hipFloatComplex(float a, float b){
+__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
     hipFloatComplex z;
     z.x = a;
     z.y = b;
     return z;
 }
 
-__device__ inline hipFloatComplex hipConjf(hipFloatComplex z){
+__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
     hipFloatComplex ret;
     ret.x = z.x;
-    ret.y = z.y;
+    ret.y = -z.y;
     return ret;
 }
 
-__device__ inline float hipCsqabsf(hipFloatComplex z){
+__device__ static inline float hipCsqabsf(hipFloatComplex z){
     return z.x * z.x + z.y * z.y;
 }
 
-__device__ hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
     return make_hipFloatComplex(p.x + q.x, p.y + q.y);
 }
 
-__device__ hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
     return make_hipFloatComplex(p.x - q.x, p.y - q.y);
 }
 
-__device__ hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
     return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }
 
-__device__ hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
     float sqabs = hipCsqabsf(q);
     hipFloatComplex ret;
     ret.x = (p.x * q.x + p.y * q.y)/sqabs;
@@ -72,7 +72,7 @@ __device__ hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
     return ret;
 }
 
-__device__ float hipCabsf(hipFloatComplex z){
+__device__ static inline float hipCabsf(hipFloatComplex z){
     return sqrtf(hipCsqabsf(z));
 }
 
@@ -82,45 +82,45 @@ typedef struct{
     double y;
 }hipDoubleComplex;
 
-__device__ inline double hipCreal(hipDoubleComplex z){
+__device__ static inline double hipCreal(hipDoubleComplex z){
     return z.x;
 }
 
-__device__ inline double hipCimag(hipDoubleComplex z){
+__device__ static inline double hipCimag(hipDoubleComplex z){
     return z.y;
 }
 
-__device__ inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
     hipDoubleComplex z;
     z.x = a;
     z.y = b;
     return z;
 }
 
-__device__ inline hipDoubleComplex hipConj(hipDoubleComplex z){
+__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
     hipDoubleComplex ret;
     ret.x = z.x;
     ret.y = z.y;
     return ret;
 }
 
-__device__ inline double hipCsqabs(hipDoubleComplex z){
+__device__ static inline double hipCsqabs(hipDoubleComplex z){
     return z.x * z.x + z.y * z.y;
 }
 
-__device__ hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
     return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
 }
 
-__device__ hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
     return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
 }
 
-__device__ hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
+__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
     return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }
 
-__device__ hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
     double sqabs = hipCsqabs(q);
     hipDoubleComplex ret;
     ret.x = (p.x * q.x + p.y * q.y)/sqabs;
@@ -128,28 +128,28 @@ __device__ hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
     return ret;
 }
 
-__device__ inline double hipCabs(hipDoubleComplex z){
+__device__ static inline double hipCabs(hipDoubleComplex z){
     return sqrtf(hipCsqabs(z));
 }
 
 typedef hipFloatComplex hipComplex;
 
-__device__ inline hipComplex make_hipComplex(float x,
+__device__ static inline hipComplex make_hipComplex(float x,
                                             float y){
     return make_hipFloatComplex(x, y);
 }
 
-__device__ inline hipFloatComplex hipComplexDoubleToFloat
+__device__ static inline hipFloatComplex hipComplexDoubleToFloat
 (hipDoubleComplex z){
     return make_hipFloatComplex((float)z.x, (float)z.y);
 }
 
-__device__ inline hipDoubleComplex hipComplexFloatToDouble
+__device__ static inline hipDoubleComplex hipComplexFloatToDouble
 (hipFloatComplex z){
     return make_hipDoubleComplex((double)z.x, (double)z.y);
 }
 
-__device__ inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
     float real = (p.x * q.x) + r.x;
     float imag = (q.x * p.y) + r.y;
 
@@ -159,7 +159,7 @@ __device__ inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
     return make_hipComplex(real, imag);
 }
 
-__device__ inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
+__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
     float real = (p.x * q.x) + r.x;
     float imag = (q.x * p.y) + r.y;
 
@@ -169,6 +169,4 @@ __device__ inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex
     return make_hipDoubleComplex(real, imag);
 }
 
-
-
 #endif
diff --git a/include/hipComplex.h b/include/hipComplex.h
index 0fdafb8263..27281a2df4 100644
--- a/include/hipComplex.h
+++ b/include/hipComplex.h
@@ -24,7 +24,7 @@ THE SOFTWARE.
 #if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/hipComplex.h>
 #elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
-#include "cuComplex.h"
+#include <hip/nvcc_detail/hipComplex.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
 #endif
diff --git a/include/nvcc_detail/hipComplex.h b/include/nvcc_detail/hipComplex.h
new file mode 100644
index 0000000000..b5c182bd4d
--- /dev/null
+++ b/include/nvcc_detail/hipComplex.h
@@ -0,0 +1,108 @@
+#ifndef HIPCOMPLEX_H
+#define HIPCOMPLEX_H
+
+#include"cuComplex.h"
+
+typedef cuFloatComplex hipFloatComplex;
+
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
+    return cuCrealf(z);
+}
+
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
+    return cuCimagf(z);
+}
+
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+    return make_cuFloatComplex(a, b);
+}
+
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
+    return cuConjf(z);
+}
+
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
+    return cuCabsf(z) * cuCabsf(z);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+    return cuCaddf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+    return cuCsubf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+    return cuCmulf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+    return cuCdivf(p, q);
+}
+
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
+    return cuCabsf(p, q);
+}
+
+typedef cuDoubleComplex hipDoubleComplex;
+
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
+    return cuCreal(z);
+}
+
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
+    return cuCimag(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+    return make_cuDoubleComplex(a, b);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
+    return cuConj(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){
+    return cuCabs(z) * cuCabs(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCadd(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCsub(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCdiv(p, q);
+}
+
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
+    return cuCabs(z);
+}
+
+typedef cuFloatComplex hipComplex;
+
+__device__ __host__ static inline hipComplex make_Complex(float x, float y){
+    return make_cuComplex(x, y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z){
+    return cuComplexDoubleToFloat(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z){
+    return cuComplexFloatToDouble(z);
+}
+
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+    return cuCfmaf(p, q, r);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){
+    return cuCfma(p, q, r);
+}
+
+#endif

From 8f0f97f8f9b988eedaa97042763244e7d3cd04c3 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 22 Aug 2016 14:17:55 -0500
Subject: [PATCH 07/56] Added stream synchronisation for hipLaunchModuleKernel
 - The module kernel launch is now in sync with commands in its stream - Moved
 launch kernel inside ihipStream

Change-Id: Ic00cfcf4882bf81b6203c36881a52575ea68b529
---
 include/hcc_detail/hip_hcc.h |  3 +-
 src/hip_hcc.cpp              | 47 +++++++++++++++++++
 src/hip_module.cpp           | 88 ++++++------------------------------
 3 files changed, 64 insertions(+), 74 deletions(-)

diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index de6613184f..516c62a79b 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -423,7 +423,7 @@ typedef uint64_t SeqNum_t ;
     // Use this if we already have the stream critical data mutex:
     void                 wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
 
-
+    void launchModuleKernel(hsa_signal_t signal, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel);
 
     // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function.
     SIGSEQNUM            lastCopySeqId (LockedAccessor_StreamCrit_t &crit) const { return crit->_last_copy_signal ? crit->_last_copy_signal->_sigId : 0; };
@@ -443,6 +443,7 @@ public:
     hc::accelerator_view        _av;
     unsigned                    _flags;
 
+
 private:
     void     enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal);
     void     waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal);
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index e6b2905799..4508b6057a 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -489,6 +489,53 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
 }
 
 
+void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
+                        uint32_t blockDimX,
+                        uint32_t blockDimY,
+                        uint32_t blockDimZ,
+                        uint32_t gridDimX,
+                        uint32_t gridDimY,
+                        uint32_t gridDimZ,
+                        uint32_t sharedMemBytes,
+                        void *kernarg,
+                        size_t kernSize,
+                        uint64_t kernel){
+    hsa_status_t status;
+    void *kern;
+    hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
+    status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
+    status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
+    memcpy(kern, kernarg, kernSize);
+    hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
+    const uint32_t queue_mask = Queue->size-1;
+    uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
+    hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
+
+    dispatch_packet->completion_signal = signal;
+    dispatch_packet->workgroup_size_x = blockDimX;
+    dispatch_packet->workgroup_size_y = blockDimY;
+    dispatch_packet->workgroup_size_z = blockDimZ;
+    dispatch_packet->grid_size_x = blockDimX * gridDimX;
+    dispatch_packet->grid_size_y = blockDimY * gridDimY;
+    dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
+    dispatch_packet->group_segment_size = 0;
+    dispatch_packet->private_segment_size = sharedMemBytes;
+    dispatch_packet->kernarg_address = kern;
+    dispatch_packet->kernel_object = kernel;
+    uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+    (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
+    uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+    uint32_t header32 = header | (setup << 16);
+
+    __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
+
+    hsa_queue_store_write_index_relaxed(Queue, packet_index + 1);
+    hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
+}
+
 
 //=============================================================================
 // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 48dc6574a0..f15dca7c59 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -49,26 +49,6 @@ namespace hipdrv{
         return HSA_STATUS_SUCCESS;
     }
 
-    hsa_status_t findKernArgRegions(hsa_region_t region, void *data){
-        hsa_region_segment_t segment_id;
-        hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
-
-        if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
-            return HSA_STATUS_SUCCESS;
-        }
-
-        hsa_region_global_flag_t flags;
-        hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
-
-        hsa_region_t *reg = (hsa_region_t*)data;
-
-        if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
-            *reg = region;
-        }
-
-        return HSA_STATUS_SUCCESS;
-    }
-
 }   // End namespace hipdrv
 
 
@@ -100,6 +80,7 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
             hsa_region_t sysRegion;
             hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
             status = hsa_memory_allocate(sysRegion, size, (void**)&p);
+
             if(status != HSA_STATUS_SUCCESS){
                 return hipErrorOutOfMemory;
             }
@@ -221,71 +202,32 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
 /*
 Kernel argument preparation.
 */
-
-        hsa_region_t kernArg;
-        hsa_status_t status = hsa_agent_iterate_regions(gpuAgent, hipdrv::findKernArgRegions, &kernArg);
-        void *kern;
-        status = hsa_memory_allocate(kernArg, kernSize, &kern);
-
-        if(status != HSA_STATUS_SUCCESS){
-            return hipErrorLaunchOutOfResources;
-        }
-
-        memcpy(kern, config[1], kernSize);
-
-
+        hsa_status_t status;
+        grid_launch_parm lp;
+        hStream = ihipPreLaunchKernel(hStream, 0, 0, &lp);
 
 /*
-Pre kernel launch 
-
-    stream = ihipSyncAndResolveStream(stream);
-    stream->lockopen_preKernelCommand();
-    hc::accelerator_view av = &stream->_av;
-    hc::completion_future cf = new hc::completion_future;
+  Create signal
 */
 
-        hStream = ihipSyncAndResolveStream(hStream);
-        hc::accelerator_view *av = &hStream->_av;
-        hsa_queue_t *Queue = (hsa_queue_t*)av->get_hsa_queue();
         hsa_signal_t signal;
         status = hsa_signal_create(1, 0, NULL, &signal);
 
-//        hsa_memory_pool_t pool = av->get_hsa_kernarg_region();
-//        status = hsa_amd_memory_pool_allocate(pool, kernSize, 0, &kernArg);
-//        assert(status == HSA_STATUS_SUCCESS);
+/*
+  Launch AQL packet
+*/
+        hStream->launchModuleKernel(signal, blockDimX, blockDimY, blockDimZ,
+                  gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f.kernel);
 
 /*
-Creating the packets
+  Wait for signal
 */
 
-        const uint32_t queue_mask = Queue->size-1;
-        uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
-        hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
-
-        dispatch_packet->completion_signal = signal;
-        dispatch_packet->workgroup_size_x = blockDimX;
-        dispatch_packet->workgroup_size_y = blockDimY;
-        dispatch_packet->workgroup_size_z = blockDimZ;
-        dispatch_packet->grid_size_x = blockDimX * gridDimX;
-        dispatch_packet->grid_size_y = blockDimY * gridDimY;
-        dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
-        dispatch_packet->group_segment_size = 0;
-        dispatch_packet->private_segment_size = sharedMemBytes;
-        dispatch_packet->kernarg_address = kern;
-        dispatch_packet->kernel_object = f.kernel;
-        uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-        (1 << HSA_PACKET_HEADER_BARRIER) |
-        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-        uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-        uint32_t header32 = header | (setup << 16);
-
-        __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
-
-        hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
-        hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
         hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+
+
+        ihipPostLaunchKernel(hStream, lp);
+                
     }
 
     return ret;

From f9d49c2aed2e67f016c86262b76b72e569450628 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Tue, 23 Aug 2016 13:50:19 -0500
Subject: [PATCH 08/56] Added module api test with gcn binary

Change-Id: I61c3ecc2b34168d10f1a7b15d668630eb2c69c8c
---
 tests/src/hipModule.cpp | 257 ++++++++++++++++++++++++++++++++++++++++
 tests/src/vcpy_isa.co   | Bin 0 -> 9416 bytes
 2 files changed, 257 insertions(+)
 create mode 100644 tests/src/hipModule.cpp
 create mode 100755 tests/src/vcpy_isa.co

diff --git a/tests/src/hipModule.cpp b/tests/src/hipModule.cpp
new file mode 100644
index 0000000000..8397b5f338
--- /dev/null
+++ b/tests/src/hipModule.cpp
@@ -0,0 +1,257 @@
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<hsa/hsa.h>
+#include<hsa/amd_hsa_kernel_code.h>
+#include<hsa/hsa_ven_amd_loader.h>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+typedef hsa_code_object_t hipmodule;
+typedef uint64_t hipfunction;
+typedef unsigned int hipDevicePtr;
+
+hsa_region_t systemRegion;
+hsa_region_t kernArgRegion;
+hsa_agent_t gpuAgent;
+hsa_queue_t *Queue;
+hsa_signal_t signal;
+
+hsa_status_t findGpu(hsa_agent_t agent, void *data){
+  hsa_device_type_t device_type;
+  hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
+  if(hsa_error_code != HSA_STATUS_SUCCESS){return hsa_error_code;}
+  if(device_type == HSA_DEVICE_TYPE_GPU){
+    gpuAgent = agent;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t FindRegions(hsa_region_t region, void *data){
+  hsa_region_segment_t segment_id;
+  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
+
+  if (segment_id != HSA_REGION_SEGMENT_GLOBAL) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  hsa_region_global_flag_t flags;
+  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+
+  if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
+    systemRegion = region;
+  }
+
+  if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
+    kernArgRegion = region;
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+hipError_t ihipModuleLoad(hipmodule *module, const char *fname){
+  std::ifstream in(fname, std::ios::binary | std::ios::ate);
+  hipError_t ret = hipSuccess;
+  if(!in){
+    std::cout<<"Couldn't read file "<<fname<<std::endl;
+    ret = hipErrorInvalidValue;
+  }else{
+    size_t size = std::string::size_type(in.tellg());
+    void *p = NULL;
+    hsa_status_t status = hsa_memory_allocate(systemRegion, size, (void**)&p);
+    assert(status == HSA_STATUS_SUCCESS);
+    char *ptr = (char*)p;
+    if(!ptr){
+      std::cout<<"Error: failed to allocate memory for code object."<<std::endl;
+    }
+    in.seekg(0, std::ios::beg);
+    std::copy(std::istreambuf_iterator<char>(in),
+              std::istreambuf_iterator<char>(), ptr);
+    status = hsa_code_object_deserialize(ptr, size, NULL, module);
+    if (status != HSA_STATUS_SUCCESS) { std::cout<<"Failed to deserialize code object"<<std::endl; }
+    std::cout<<"File Read!"<<std::endl;
+  }
+  return ret;
+}
+
+hipError_t ihipModuleGetFunction(hipfunction *func, hipmodule hmod, const char *name){
+  hsa_status_t status;
+  hsa_executable_symbol_t kernel_symbol;
+  hsa_executable_t executable;
+  status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_executable_load_code_object(executable, gpuAgent, hmod, NULL);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_executable_freeze(executable, NULL);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_executable_symbol_get_info(kernel_symbol,
+                    HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+                    func);
+  assert(status == HSA_STATUS_SUCCESS);
+  return hipSuccess;
+}
+
+void hsaInit(){
+  hsa_status_t status;
+  uint32_t queue_size;
+  status = hsa_iterate_agents(findGpu, NULL);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_agent_iterate_regions(gpuAgent, FindRegions, NULL);
+  assert(status == HSA_STATUS_SUCCESS);
+  char agent_name[64];
+  status = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_NAME, agent_name);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_queue_create(gpuAgent, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &Queue);
+  assert(status == HSA_STATUS_SUCCESS);
+  status = hsa_signal_create(1, 0, NULL, &signal);
+  assert(status == HSA_STATUS_SUCCESS);
+  std::cout<<agent_name<<std::endl;
+}
+
+hipError_t hipDrvLaunchKernel(hipfunction f,
+  uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
+  uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
+  uint32_t sharedMemBytes, hipStream_t hStream,
+  void **kernelParams, uint32_t size, void **extra){
+    void *kernarg;
+    std::vector<void*>Args;
+    void ***newP = (void***)kernelParams;
+    for(uint32_t i=0;i<size/sizeof(void*);i++){
+      Args.push_back((void*)*newP[i]);
+    }
+
+    hsa_status_t status = hsa_memory_allocate(kernArgRegion, size, &kernarg);
+    memcpy(kernarg, Args.data(), size);
+    const uint32_t queue_mask = Queue->size -1;
+    uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
+    hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
+    dispatch_packet->completion_signal = signal;
+    dispatch_packet->workgroup_size_x = blockDimX;
+    dispatch_packet->workgroup_size_y = blockDimY;
+    dispatch_packet->workgroup_size_z = blockDimZ;
+    dispatch_packet->grid_size_x = blockDimX * gridDimX;
+    dispatch_packet->grid_size_y = blockDimY * gridDimY;
+    dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
+    dispatch_packet->group_segment_size = 0;
+    dispatch_packet->private_segment_size = sharedMemBytes;
+    dispatch_packet->kernarg_address = kernarg;
+    dispatch_packet->kernel_object = (uint64_t)f;
+    uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+    (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+    uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+    uint32_t header32 = header | (setup << 16);
+    __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
+    hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
+    hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
+    hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+    return hipSuccess;
+}
+
+#define fileName "vcpy_isa.co"
+#define kernel_name "hello_world"
+
+__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
+  int tx = hipThreadIdx_x;
+  Bd[tx] = Ad[tx];
+}
+
+amd_kernel_code_t* getAkc(uint64_t handle){
+    bool ext_supported = false;
+    hsa_status_t status = hsa_system_extension_supported(
+                HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_supported);
+    assert(HSA_STATUS_SUCCESS == status);
+    assert(true == ext_supported);
+    hsa_ven_amd_loader_1_00_pfn_t ext_table = {nullptr};
+    status = hsa_system_get_extension_table(
+        HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_table);
+    assert(HSA_STATUS_SUCCESS == status);
+    assert(nullptr != ext_table.hsa_ven_amd_loader_query_host_address);
+    std::cout<<"Start"<<std::endl;
+    const void *akc = nullptr;//new amd_kernel_code_t;
+    std::cout<<"End"<<std::endl;
+    status = ext_table.hsa_ven_amd_loader_query_host_address(
+      reinterpret_cast<void*>(handle), &akc);
+
+    if(HSA_STATUS_SUCCESS != status){
+        akc = reinterpret_cast<void*>(handle);
+    }
+
+    assert(nullptr!=akc);
+    amd_kernel_code_t *Akc = (amd_kernel_code_t*)akc;
+    std::cout<<Akc->kernarg_segment_byte_size<<std::endl;
+    return nullptr;
+}
+
+int main(){
+  float *A, *B, *Ad, *Bd;
+  hipDevicePtr Aptr, Bptr;
+  A = new float[LEN];
+  B = new float[LEN];
+
+  for(uint32_t i=0;i<LEN;i++){
+    A[i] = i*1.0f;
+    B[i] = 0.0f;
+    std::cout<<A[i] << " "<<B[i]<<std::endl;
+  }
+
+  hipMalloc((void**)&Ad, SIZE);
+  hipMalloc((void**)&Bd, SIZE);
+  Aptr = reinterpret_cast<unsigned long>(Ad);
+  Bptr = reinterpret_cast<unsigned long>(Bd);
+  hsaInit();
+
+  hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+  hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
+  hipModule Module;
+  hipFunction Function;
+  hipModuleLoad(&Module, fileName);
+  hipModuleGetFunction(&Function, Module, kernel_name);
+  hipStream_t stream;
+  hipStreamCreate(&stream);
+  void *args[2] = {&Ad, &Bd};
+
+/*  struct __attribute__((aligned(16))) args_t{
+    void *Aptr;
+    void *Bptr;
+  } args;
+  args.Aptr = Ad;
+  args.Bptr = Bd;
+*/
+//  hipDrvLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, (void**)&args, sizeof(args), 0);
+
+    amd_kernel_code_t *akc = getAkc(Function.kernel);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipLaunchModuleKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config); 
+
+    hipStreamDestroy(stream);
+
+//  hipLaunchKernel(Cpy, dim3(1), dim3(LEN), 0, 0, Ad, Bd);
+ hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+
+  for(uint32_t i=0;i<LEN;i++){
+  std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+  }
+
+  return 0;
+}
diff --git a/tests/src/vcpy_isa.co b/tests/src/vcpy_isa.co
new file mode 100755
index 0000000000000000000000000000000000000000..96105ee74b7b31c85f79b405cbb7ab44804b1a55
GIT binary patch
literal 9416
zcmeHNJ#Q015S=?8HX(^f1qc-BLO~Us5S2??IDAM183YBDeb@$D_#?6}f)peoRf<Ub
z1R5mx7o?=7PC-LYNy*IazRUS=1Sly$nw9Qm=FQCB?%Qo<_jcCq-SR!JLar6)D|s+j
zR-C@Kas*bWOeOX#(mWLz4*01!cV6zf&LGdaIDsYaxVG$D4<s#FTk_6`>g>F%yYE55
zFR+Z2LV0lu%{!G%XF?)RhVeIfnBT*1N!!dXbRFkp)4`B#*z|z&%=euCq(b_=EKEDT
zz>CaF9(*_bPPfzVj~XvW<9<8ER8a<$0cAiLPzIC%Wk4BF29yD1KpFV&89@E#kyNgQ
z83p0=6sxpK`wY#|xhV(AaRC=>g`WC@AUMsEHtqK`6rYL*($72cuM)levddNRE)@@i
zjUC$m#HSQDJioZ@m5ODe`XSfMfzMdMr)|$GmN!hy#-VSD{Wl!1O&L%IlmTTx8Bhk4
z0cAiLPzIC%Wk4DD_ZUE7{@EP_%)238a?;$jv%Vyk#oSQ!FIqFb!~cPU1cyUNSuC}*
zxU1{6^v+rEEBx?@gTVv-qX2j3z&QSP{jua8(>dpwarivSkc~`}j9K^I6t%a8ldS<o
z-FVWqRx+k2>AXse6%TqXiiV@4Ly;XoG~OI0y+Nlj=p=DFPU6(j8V#TIo*$VL>*el~
z_$ft`?j#vo%}UB%t~Y8=?pN3E-pma81ggwe*3K;xVLc(QyS{~6i3QhK=Q+k?t&xxQ
zghZU=N;AhD8F`*{=^OdjFOb+T9AX;!fqjt`!LT3kzXBxwTR?n%ODwxw8G~uzv}`&P
ze#{@|R@c~Haxq!a46Rkmf~;fyfLB?`&q;jmLwQ?ntY+W&*foapm_&XE<^5~R$3Eha
zJSJIxn&0ABtexbceGIL3#{mm0<vZXH`{D&-;MRp2@jbNG>nzB6eyp2}&B`CF*u)FJ
G$^Qui<Y~G9

literal 0
HcmV?d00001


From 2287af23a1f3086fe100bc2d0077a49b0dfd51fd Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Tue, 23 Aug 2016 14:19:15 -0500
Subject: [PATCH 09/56] Module test correction and hipModuleUnload API -
 Corrected the hipModule.cpp test to minimal code - Added hipModuleUnload API
 - Added hipModuleUnload API test

Change-Id: I9c40337043d7972a570b795e1bfc104bd2c4d8aa
---
 include/hcc_detail/hip_runtime_api.h |   2 +
 src/hip_module.cpp                   |  13 ++
 tests/src/hipModule.cpp              | 207 +++------------------------
 tests/src/hipModuleUnload.cpp        |  31 ++++
 4 files changed, 65 insertions(+), 188 deletions(-)
 create mode 100644 tests/src/hipModuleUnload.cpp

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 094549a3a1..7afd499041 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1113,6 +1113,8 @@ hipError_t hipDriverGetVersion(int *driverVersion) ;
 
 hipError_t hipModuleLoad(hipModule *module, const char *fname);
 
+hipError_t hipModuleUnload(hipModule module);
+
 hipError_t hipModuleGetFunction(hipFunction *function, hipModule module, const char *kname);
 
 hipError_t hipLaunchModuleKernel(hipFunction f,
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index f15dca7c59..6227ce750f 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -116,6 +116,19 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
     return ret;
 }
 
+hipError_t hipModuleUnload(hipModule hmod){
+    hsa_executable_t exec;
+    hsa_code_object_t co;
+    hipError_t ret = hipSuccess;
+    exec.handle = hmod.executable;
+    co.handle = hmod.object;
+    hsa_status_t status = hsa_executable_destroy(exec);
+    if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
+    status = hsa_code_object_destroy(co);
+    if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
+    return ret;
+}
+
 hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *name){
     HIP_INIT_API(name);
     auto ctx = ihipGetTlsDefaultCtx();
diff --git a/tests/src/hipModule.cpp b/tests/src/hipModule.cpp
index 8397b5f338..784a01c1d0 100644
--- a/tests/src/hipModule.cpp
+++ b/tests/src/hipModule.cpp
@@ -1,3 +1,22 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
 #include<hip_runtime.h>
 #include<hip_runtime_api.h>
 #include<iostream>
@@ -10,152 +29,6 @@
 #define LEN 64
 #define SIZE LEN<<2
 
-typedef hsa_code_object_t hipmodule;
-typedef uint64_t hipfunction;
-typedef unsigned int hipDevicePtr;
-
-hsa_region_t systemRegion;
-hsa_region_t kernArgRegion;
-hsa_agent_t gpuAgent;
-hsa_queue_t *Queue;
-hsa_signal_t signal;
-
-hsa_status_t findGpu(hsa_agent_t agent, void *data){
-  hsa_device_type_t device_type;
-  hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
-  if(hsa_error_code != HSA_STATUS_SUCCESS){return hsa_error_code;}
-  if(device_type == HSA_DEVICE_TYPE_GPU){
-    gpuAgent = agent;
-  }
-
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t FindRegions(hsa_region_t region, void *data){
-  hsa_region_segment_t segment_id;
-  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
-
-  if (segment_id != HSA_REGION_SEGMENT_GLOBAL) {
-    return HSA_STATUS_SUCCESS;
-  }
-
-  hsa_region_global_flag_t flags;
-  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
-
-  if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
-    systemRegion = region;
-  }
-
-  if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
-    kernArgRegion = region;
-  }
-  return HSA_STATUS_SUCCESS;
-}
-
-hipError_t ihipModuleLoad(hipmodule *module, const char *fname){
-  std::ifstream in(fname, std::ios::binary | std::ios::ate);
-  hipError_t ret = hipSuccess;
-  if(!in){
-    std::cout<<"Couldn't read file "<<fname<<std::endl;
-    ret = hipErrorInvalidValue;
-  }else{
-    size_t size = std::string::size_type(in.tellg());
-    void *p = NULL;
-    hsa_status_t status = hsa_memory_allocate(systemRegion, size, (void**)&p);
-    assert(status == HSA_STATUS_SUCCESS);
-    char *ptr = (char*)p;
-    if(!ptr){
-      std::cout<<"Error: failed to allocate memory for code object."<<std::endl;
-    }
-    in.seekg(0, std::ios::beg);
-    std::copy(std::istreambuf_iterator<char>(in),
-              std::istreambuf_iterator<char>(), ptr);
-    status = hsa_code_object_deserialize(ptr, size, NULL, module);
-    if (status != HSA_STATUS_SUCCESS) { std::cout<<"Failed to deserialize code object"<<std::endl; }
-    std::cout<<"File Read!"<<std::endl;
-  }
-  return ret;
-}
-
-hipError_t ihipModuleGetFunction(hipfunction *func, hipmodule hmod, const char *name){
-  hsa_status_t status;
-  hsa_executable_symbol_t kernel_symbol;
-  hsa_executable_t executable;
-  status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_executable_load_code_object(executable, gpuAgent, hmod, NULL);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_executable_freeze(executable, NULL);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_executable_symbol_get_info(kernel_symbol,
-                    HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
-                    func);
-  assert(status == HSA_STATUS_SUCCESS);
-  return hipSuccess;
-}
-
-void hsaInit(){
-  hsa_status_t status;
-  uint32_t queue_size;
-  status = hsa_iterate_agents(findGpu, NULL);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_agent_iterate_regions(gpuAgent, FindRegions, NULL);
-  assert(status == HSA_STATUS_SUCCESS);
-  char agent_name[64];
-  status = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_NAME, agent_name);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_queue_create(gpuAgent, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &Queue);
-  assert(status == HSA_STATUS_SUCCESS);
-  status = hsa_signal_create(1, 0, NULL, &signal);
-  assert(status == HSA_STATUS_SUCCESS);
-  std::cout<<agent_name<<std::endl;
-}
-
-hipError_t hipDrvLaunchKernel(hipfunction f,
-  uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
-  uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
-  uint32_t sharedMemBytes, hipStream_t hStream,
-  void **kernelParams, uint32_t size, void **extra){
-    void *kernarg;
-    std::vector<void*>Args;
-    void ***newP = (void***)kernelParams;
-    for(uint32_t i=0;i<size/sizeof(void*);i++){
-      Args.push_back((void*)*newP[i]);
-    }
-
-    hsa_status_t status = hsa_memory_allocate(kernArgRegion, size, &kernarg);
-    memcpy(kernarg, Args.data(), size);
-    const uint32_t queue_mask = Queue->size -1;
-    uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
-    hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
-    dispatch_packet->completion_signal = signal;
-    dispatch_packet->workgroup_size_x = blockDimX;
-    dispatch_packet->workgroup_size_y = blockDimY;
-    dispatch_packet->workgroup_size_z = blockDimZ;
-    dispatch_packet->grid_size_x = blockDimX * gridDimX;
-    dispatch_packet->grid_size_y = blockDimY * gridDimY;
-    dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
-    dispatch_packet->group_segment_size = 0;
-    dispatch_packet->private_segment_size = sharedMemBytes;
-    dispatch_packet->kernarg_address = kernarg;
-    dispatch_packet->kernel_object = (uint64_t)f;
-    uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-    (1 << HSA_PACKET_HEADER_BARRIER) |
-    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-    uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-    uint32_t header32 = header | (setup << 16);
-    __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
-    hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
-    hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
-    hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-    return hipSuccess;
-}
-
 #define fileName "vcpy_isa.co"
 #define kernel_name "hello_world"
 
@@ -164,36 +37,8 @@ __global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
   Bd[tx] = Ad[tx];
 }
 
-amd_kernel_code_t* getAkc(uint64_t handle){
-    bool ext_supported = false;
-    hsa_status_t status = hsa_system_extension_supported(
-                HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_supported);
-    assert(HSA_STATUS_SUCCESS == status);
-    assert(true == ext_supported);
-    hsa_ven_amd_loader_1_00_pfn_t ext_table = {nullptr};
-    status = hsa_system_get_extension_table(
-        HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_table);
-    assert(HSA_STATUS_SUCCESS == status);
-    assert(nullptr != ext_table.hsa_ven_amd_loader_query_host_address);
-    std::cout<<"Start"<<std::endl;
-    const void *akc = nullptr;//new amd_kernel_code_t;
-    std::cout<<"End"<<std::endl;
-    status = ext_table.hsa_ven_amd_loader_query_host_address(
-      reinterpret_cast<void*>(handle), &akc);
-
-    if(HSA_STATUS_SUCCESS != status){
-        akc = reinterpret_cast<void*>(handle);
-    }
-
-    assert(nullptr!=akc);
-    amd_kernel_code_t *Akc = (amd_kernel_code_t*)akc;
-    std::cout<<Akc->kernarg_segment_byte_size<<std::endl;
-    return nullptr;
-}
-
 int main(){
   float *A, *B, *Ad, *Bd;
-  hipDevicePtr Aptr, Bptr;
   A = new float[LEN];
   B = new float[LEN];
 
@@ -205,9 +50,6 @@ int main(){
 
   hipMalloc((void**)&Ad, SIZE);
   hipMalloc((void**)&Bd, SIZE);
-  Aptr = reinterpret_cast<unsigned long>(Ad);
-  Bptr = reinterpret_cast<unsigned long>(Bd);
-  hsaInit();
 
   hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
   hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
@@ -219,16 +61,6 @@ int main(){
   hipStreamCreate(&stream);
   void *args[2] = {&Ad, &Bd};
 
-/*  struct __attribute__((aligned(16))) args_t{
-    void *Aptr;
-    void *Bptr;
-  } args;
-  args.Aptr = Ad;
-  args.Bptr = Bd;
-*/
-//  hipDrvLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, (void**)&args, sizeof(args), 0);
-
-    amd_kernel_code_t *akc = getAkc(Function.kernel);
 
     std::vector<void*>argBuffer(2);
     memcpy(&argBuffer[0], &Ad, sizeof(void*));
@@ -246,7 +78,6 @@ int main(){
 
     hipStreamDestroy(stream);
 
-//  hipLaunchKernel(Cpy, dim3(1), dim3(LEN), 0, 0, Ad, Bd);
  hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
 
   for(uint32_t i=0;i<LEN;i++){
diff --git a/tests/src/hipModuleUnload.cpp b/tests/src/hipModuleUnload.cpp
new file mode 100644
index 0000000000..fd23a4717e
--- /dev/null
+++ b/tests/src/hipModuleUnload.cpp
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+#include<iostream>
+
+#define fileName "vcpy_isa.co"
+
+int main(){
+    hipModule module;
+    hipModuleLoad(&module, fileName);
+    hipModuleUnload(module);
+}
+

From cb996c7b7a232a0c3ada69b7470e27adc89df9b8 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 24 Aug 2016 09:47:11 -0500
Subject: [PATCH 10/56] changed internal structure of hipFunction and hipModule

Change-Id: Ifa343782e29d7e056efc47e56253311013005093
---
 include/hcc_detail/hip_hcc.h         | 14 +++++++++
 include/hcc_detail/hip_runtime_api.h | 10 ++-----
 src/hip_module.cpp                   | 44 ++++++++++------------------
 3 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index 516c62a79b..25e4c56a48 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -396,6 +396,20 @@ public:
 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
 
+class ihipModule_t{
+public:
+  hsa_executable_t executable;
+  hsa_code_object_t object;
+  std::string fileName;
+};
+
+
+class ihipFunction_t{
+public:
+  hsa_executable_symbol_t kernel_symbol;
+  uint64_t kernel;
+};
+
 // Internal stream structure.
 class ihipStream_t {
 public:
diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 7afd499041..a44d1a299c 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -52,15 +52,9 @@ typedef struct ihipDevice_t *hipDevice_t;
 
 typedef struct ihipStream_t *hipStream_t;
 
-typedef struct {
-uint64_t kernel;
-uint64_t symbol;
-}hipFunction;
+typedef struct ihipModule_t *hipModule;
 
-typedef struct{
-uint64_t executable;
-uint64_t object;
-} hipModule;
+typedef struct ihipFunction_t *hipFunction;
 
 typedef struct hipEvent_t {
     struct ihipEvent_t *_handle;
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 6227ce750f..2fcfdc703a 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -56,6 +56,7 @@ namespace hipdrv{
 hipError_t hipModuleLoad(hipModule *module, const char *fname){
     HIP_INIT_API(fname);
     hipError_t ret = hipSuccess;
+    *module = new ihipModule_t;
 
     if(module == NULL){
         return hipErrorInvalidValue;
@@ -74,6 +75,8 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
             return hipErrorFileNotFound;
 
         }else{
+
+            *module = new ihipModule_t;
             size_t size = std::string::size_type(in.tellg());
             void *p = NULL;
             hsa_agent_t agent = currentDevice->_hsaAgent;
@@ -90,26 +93,19 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
                 return hipErrorOutOfMemory;
             }
 
-            hsa_code_object_t obj;
             in.seekg(0, std::ios::beg);
             std::copy(std::istreambuf_iterator<char>(in),
                       std::istreambuf_iterator<char>(), ptr);
-            status = hsa_code_object_deserialize(ptr, size, NULL, &obj);
+            status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
 
             if(status != HSA_STATUS_SUCCESS){
                 return hipErrorSharedObjectInitFailed;
             }
 
-            module->object = obj.handle;
-
-            hsa_executable_t executable;
-            status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
+            status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
             if(status != HSA_STATUS_SUCCESS){
                 return hipErrorNotInitialized;
             }
-
-            module->executable = executable.handle;
-
         }
     }
 
@@ -117,15 +113,12 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
 }
 
 hipError_t hipModuleUnload(hipModule hmod){
-    hsa_executable_t exec;
-    hsa_code_object_t co;
     hipError_t ret = hipSuccess;
-    exec.handle = hmod.executable;
-    co.handle = hmod.object;
-    hsa_status_t status = hsa_executable_destroy(exec);
+    hsa_status_t status = hsa_executable_destroy(hmod->executable);
     if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
-    status = hsa_code_object_destroy(co);
+    status = hsa_code_object_destroy(hmod->object);
     if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
+    delete hmod;
     return ret;
 }
 
@@ -142,40 +135,35 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n
         ret = hipErrorInvalidContext;
 
     }else{
+        *func = new ihipFunction_t;
         int deviceId = ctx->getDevice()->_deviceId;
         ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
         hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
 
         hsa_status_t status;
-        hsa_executable_t executable;
-        hsa_executable_symbol_t kernel_symbol;
-        executable.handle = hmod.executable;
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotInitialized;
         }
 
-        hsa_code_object_t obj;
-        obj.handle = hmod.object;
-        status = hsa_executable_load_code_object(executable, gpuAgent, obj, NULL);
+        status = hsa_executable_load_code_object(hmod->executable, gpuAgent, hmod->object, NULL);
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotInitialized;
         }
 
-        status = hsa_executable_freeze(executable, NULL);
-        status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
+        status = hsa_executable_freeze(hmod->executable, NULL);
+        status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &(*func)->kernel_symbol);
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotFound;
         }
 
-        status = hsa_executable_symbol_get_info(kernel_symbol,
+        status = hsa_executable_symbol_get_info((*func)->kernel_symbol,
                                    HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
-                                   &func->kernel);
+                                   &(*func)->kernel);
 
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotFound;
         }
 
-        func->symbol = kernel_symbol.handle;
 
     }
 
@@ -187,7 +175,7 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
             uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
             uint32_t sharedMemBytes, hipStream_t hStream,
             void **kernelParams, void **extra){
-    HIP_INIT_API(f.kernel);
+    HIP_INIT_API(f->kernel);
     auto ctx = ihipGetTlsDefaultCtx();
     hipError_t ret = hipSuccess;
 
@@ -230,7 +218,7 @@ Kernel argument preparation.
   Launch AQL packet
 */
         hStream->launchModuleKernel(signal, blockDimX, blockDimY, blockDimZ,
-                  gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f.kernel);
+                  gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f->kernel);
 
 /*
   Wait for signal

From 0f4974dbcbfa229849a3db2d369ba87a190580d0 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Wed, 24 Aug 2016 18:51:36 +0300
Subject: [PATCH 11/56] clang-hipify: code refactoring and performance
 improvement

---
 clang-hipify/src/Cuda2Hip.cpp | 482 +++++++++++++++++++---------------
 1 file changed, 268 insertions(+), 214 deletions(-)

diff --git a/clang-hipify/src/Cuda2Hip.cpp b/clang-hipify/src/Cuda2Hip.cpp
index 45960ac8ca..b07b21dfa3 100644
--- a/clang-hipify/src/Cuda2Hip.cpp
+++ b/clang-hipify/src/Cuda2Hip.cpp
@@ -1206,17 +1206,10 @@ private:
 };
 
 class Cuda2HipCallback : public MatchFinder::MatchCallback {
-public:
-  Cuda2HipCallback(Replacements *Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks)
-    : Replace(Replace), owner(parent), PP(PPCallbacks) {
-    PP->setMatch(this);
-  }
-
-  void convertKernelDecl(const FunctionDecl *kernelDecl,
-                         const MatchFinder::MatchResult &Result) {
+private:
+  void convertKernelDecl(const FunctionDecl *kernelDecl, const MatchFinder::MatchResult &Result) {
     SourceManager *SM = Result.SourceManager;
     LangOptions DefaultLangOptions;
-
     SmallString<40> XStr;
     raw_svector_ostream OS(XStr);
     StringRef initialParamList;
@@ -1224,47 +1217,44 @@ public:
     size_t replacementLength = OS.str().size();
     SourceLocation sl = kernelDecl->getNameInfo().getEndLoc();
     SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(
-        sl, tok::l_paren, *SM, DefaultLangOptions, true);
+      sl, tok::l_paren, *SM, DefaultLangOptions, true);
     DEBUG(dbgs() << kernelArgListStart.printToString(*SM));
     if (kernelDecl->getNumParams() > 0) {
       const ParmVarDecl *pvdFirst = kernelDecl->getParamDecl(0);
       const ParmVarDecl *pvdLast =
-          kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1);
+        kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1);
       SourceLocation kernelArgListStart(pvdFirst->getLocStart());
       SourceLocation kernelArgListEnd(pvdLast->getLocEnd());
       SourceLocation stop = Lexer::getLocForEndOfToken(
-          kernelArgListEnd, 0, *SM, DefaultLangOptions);
+        kernelArgListEnd, 0, *SM, DefaultLangOptions);
       replacementLength +=
-          SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
+        SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
       initialParamList = StringRef(SM->getCharacterData(kernelArgListStart),
-                                   replacementLength);
+        replacementLength);
       OS << ", " << initialParamList;
     }
     DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n"
-                 << "new paramlist: " << OS.str() << "\n");
+      << "new paramlist: " << OS.str() << "\n");
     Replacement Rep0(*(Result.SourceManager), kernelArgListStart,
-                     replacementLength, OS.str());
+      replacementLength, OS.str());
     Replace->insert(Rep0);
   }
 
-  void run(const MatchFinder::MatchResult &Result) override {
-    SourceManager *SM = Result.SourceManager;
-    LangOptions DefaultLangOptions;
-
-    if (const CallExpr *call =
-            Result.Nodes.getNodeAs<CallExpr>("cudaCall")) {
+  bool cudaCall(const MatchFinder::MatchResult &Result) {
+    if (const CallExpr *call = Result.Nodes.getNodeAs<CallExpr>("cudaCall")) {
       const FunctionDecl *funcDcl = call->getDirectCallee();
       StringRef name = funcDcl->getDeclName().getAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
+        SourceManager *SM = Result.SourceManager;
         StringRef repName = found->second.hipName;
         SourceLocation sl = call->getLocStart();
         size_t length = name.size();
         bool bReplace = true;
         if (SM->isMacroArgExpansion(sl)) {
           sl = SM->getImmediateSpellingLoc(sl);
-        }
-        else if (SM->isMacroBodyExpansion(sl)) {
+        } else if (SM->isMacroBodyExpansion(sl)) {
+          LangOptions DefaultLangOptions;
           SourceLocation sl_macro = SM->getExpansionLoc(sl);
           SourceLocation sl_end = Lexer::getLocForEndOfToken(sl_macro, 0, *SM, DefaultLangOptions);
           length = SM->getCharacterData(sl_end) - SM->getCharacterData(sl_macro);
@@ -1281,10 +1271,13 @@ public:
           Replace->insert(Rep);
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const CUDAKernelCallExpr *launchKernel =
-            Result.Nodes.getNodeAs<CUDAKernelCallExpr>("cudaLaunchKernel")) {
+  bool cudaLaunchKernel(const MatchFinder::MatchResult &Result) {
+    if (const CUDAKernelCallExpr *launchKernel = Result.Nodes.getNodeAs<CUDAKernelCallExpr>("cudaLaunchKernel")) {
       SmallString<40> XStr;
       raw_svector_ostream OS(XStr);
       StringRef calleeName;
@@ -1295,78 +1288,71 @@ public:
       } else {
         const Expr *e = launchKernel->getCallee();
         if (const UnresolvedLookupExpr *ule =
-                dyn_cast<UnresolvedLookupExpr>(e)) {
+          dyn_cast<UnresolvedLookupExpr>(e)) {
           calleeName = ule->getName().getAsIdentifierInfo()->getName();
           owner->addMatcher(functionTemplateDecl(hasName(calleeName))
-                                .bind("unresolvedTemplateName"),
-                            this);
+            .bind("unresolvedTemplateName"),
+            this);
         }
       }
-
       XStr.clear();
       OS << "hipLaunchKernel(HIP_KERNEL_NAME(" << calleeName << "),";
-
       const CallExpr *config = launchKernel->getConfig();
-      DEBUG(dbgs() << "Kernel config arguments:"
-                   << "\n");
+      DEBUG(dbgs() << "Kernel config arguments:" << "\n");
+      SourceManager *SM = Result.SourceManager;
+      LangOptions DefaultLangOptions;
       for (unsigned argno = 0; argno < config->getNumArgs(); argno++) {
         const Expr *arg = config->getArg(argno);
         if (!isa<CXXDefaultArgExpr>(arg)) {
-          const ParmVarDecl *pvd =
-              config->getDirectCallee()->getParamDecl(argno);
-
+          const ParmVarDecl *pvd = config->getDirectCallee()->getParamDecl(argno);
           SourceLocation sl(arg->getLocStart());
           SourceLocation el(arg->getLocEnd());
-          SourceLocation stop =
-              Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
+          SourceLocation stop = Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
           StringRef outs(SM->getCharacterData(sl),
-                         SM->getCharacterData(stop) - SM->getCharacterData(sl));
+            SM->getCharacterData(stop) - SM->getCharacterData(sl));
           DEBUG(dbgs() << "args[ " << argno << "]" << outs << " <"
-                       << pvd->getType().getAsString() << ">"
-                       << "\n");
-          if (pvd->getType().getAsString().compare("dim3") == 0)
+            << pvd->getType().getAsString() << ">"
+            << "\n");
+          if (pvd->getType().getAsString().compare("dim3") == 0) {
             OS << " dim3(" << outs << "),";
-          else
+          } else {
             OS << " " << outs << ",";
-        } else
+          }
+        } else {
           OS << " 0,";
+        }
       }
-
       for (unsigned argno = 0; argno < launchKernel->getNumArgs(); argno++) {
         const Expr *arg = launchKernel->getArg(argno);
         SourceLocation sl(arg->getLocStart());
         SourceLocation el(arg->getLocEnd());
         SourceLocation stop =
-            Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
+          Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
         std::string outs(SM->getCharacterData(sl),
-                         SM->getCharacterData(stop) - SM->getCharacterData(sl));
+          SM->getCharacterData(stop) - SM->getCharacterData(sl));
         DEBUG(dbgs() << outs << "\n");
         OS << " " << outs << ",";
       }
       XStr.pop_back();
       OS << ")";
       size_t length =
-          SM->getCharacterData(Lexer::getLocForEndOfToken(
-              launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) -
-          SM->getCharacterData(launchKernel->getLocStart());
+        SM->getCharacterData(Lexer::getLocForEndOfToken(
+        launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) -
+        SM->getCharacterData(launchKernel->getLocStart());
       Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str());
       Replace->insert(Rep);
       countReps[ConvTypes::CONV_KERN]++;
+      return true;
     }
+    return false;
+  }
 
-    if (const FunctionTemplateDecl *templateDecl =
-            Result.Nodes.getNodeAs<FunctionTemplateDecl>(
-                "unresolvedTemplateName")) {
-      FunctionDecl *kernelDecl = templateDecl->getTemplatedDecl();
-      convertKernelDecl(kernelDecl, Result);
-    }
-
-    if (const MemberExpr *threadIdx =
-            Result.Nodes.getNodeAs<MemberExpr>("cudaBuiltin")) {
+  bool cudaBuiltin(const MatchFinder::MatchResult &Result) {
+    if (const MemberExpr *threadIdx = Result.Nodes.getNodeAs<MemberExpr>("cudaBuiltin")) {
       if (const OpaqueValueExpr *refBase =
-              dyn_cast<OpaqueValueExpr>(threadIdx->getBase())) {
+        dyn_cast<OpaqueValueExpr>(threadIdx->getBase())) {
         if (const DeclRefExpr *declRef =
-                dyn_cast<DeclRefExpr>(refBase->getSourceExpr())) {
+          dyn_cast<DeclRefExpr>(refBase->getSourceExpr())) {
           StringRef name = declRef->getDecl()->getName();
           StringRef memberName = threadIdx->getMemberDecl()->getName();
           size_t pos = memberName.find_first_not_of("__fetch_builtin_");
@@ -1378,48 +1364,60 @@ public:
             countReps[found->second.countType]++;
             StringRef repName = found->second.hipName;
             SourceLocation sl = threadIdx->getLocStart();
+            SourceManager *SM = Result.SourceManager;
             Replacement Rep(*SM, sl, name.size(), repName);
             Replace->insert(Rep);
           }
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const DeclRefExpr *cudaEnumConstantRef =
-            Result.Nodes.getNodeAs<DeclRefExpr>("cudaEnumConstantRef")) {
-      StringRef name = cudaEnumConstantRef->getDecl()->getNameAsString();
+  bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) {
+    if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs<DeclRefExpr>("cudaEnumConstantRef")) {
+      StringRef name = enumConstantRef->getDecl()->getNameAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
         countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        SourceLocation sl = cudaEnumConstantRef->getLocStart();
+        SourceLocation sl = enumConstantRef->getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaEnumConstantDecl =
-            Result.Nodes.getNodeAs<VarDecl>("cudaEnumConstantDecl")) {
+  bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs<VarDecl>("cudaEnumConstantDecl")) {
       StringRef name =
-          cudaEnumConstantDecl->getType()->getAsTagDecl()->getNameAsString();
+        enumConstantDecl->getType()->getAsTagDecl()->getNameAsString();
       // anonymous typedef enum
       if (name.empty()) {
-        QualType QT = cudaEnumConstantDecl->getType().getUnqualifiedType();
+        QualType QT = enumConstantDecl->getType().getUnqualifiedType();
         name = QT.getAsString();
       }
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
         countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        SourceLocation sl = cudaEnumConstantDecl->getLocStart();
+        SourceLocation sl = enumConstantDecl->getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaTypedefVar =
-      Result.Nodes.getNodeAs<VarDecl>("cudaTypedefVar")) {
-      QualType QT = cudaTypedefVar->getType();
+  bool cudaTypedefVar(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *typedefVar = Result.Nodes.getNodeAs<VarDecl>("cudaTypedefVar")) {
+      QualType QT = typedefVar->getType();
       if (QT->isArrayType()) {
         QT = QT.getTypePtr()->getAsArrayTypeUnsafe()->getElementType();
       }
@@ -1429,31 +1427,81 @@ public:
       if (found != N.cuda2hipRename.end()) {
         countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        SourceLocation sl = cudaTypedefVar->getLocStart();
+        SourceLocation sl = typedefVar->getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaStructVar =
-            Result.Nodes.getNodeAs<VarDecl>("cudaStructVar")) {
-      StringRef name = cudaStructVar->getType()
-                           ->getAsStructureType()
-                           ->getDecl()
-                           ->getNameAsString();
+  bool cudaStructVar(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *structVar = Result.Nodes.getNodeAs<VarDecl>("cudaStructVar")) {
+      StringRef name = structVar->getType()
+        ->getAsStructureType()
+        ->getDecl()
+        ->getNameAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
         countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        TypeLoc TL = cudaStructVar->getTypeSourceInfo()->getTypeLoc();
+        TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *sharedVar =
-      Result.Nodes.getNodeAs<VarDecl>("cudaSharedIncompleteArrayVar")) {
+  bool cudaStructVarPtr(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *structVarPtr = Result.Nodes.getNodeAs<VarDecl>("cudaStructVarPtr")) {
+      const Type *t = structVarPtr->getType().getTypePtrOrNull();
+      if (t) {
+        StringRef name = t->getPointeeCXXRecordDecl()->getName();
+        const auto found = N.cuda2hipRename.find(name);
+        if (found != N.cuda2hipRename.end()) {
+          countReps[found->second.countType]++;
+          StringRef repName = found->second.hipName;
+          TypeLoc TL = structVarPtr->getTypeSourceInfo()->getTypeLoc();
+          SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+          SourceManager *SM = Result.SourceManager;
+          Replacement Rep(*SM, sl, name.size(), repName);
+          Replace->insert(Rep);
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+
+  bool cudaStructSizeOf(const MatchFinder::MatchResult &Result) {
+    if (const UnaryExprOrTypeTraitExpr *expr = Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>("cudaStructSizeOf")) {
+      TypeSourceInfo *typeInfo = expr->getArgumentTypeInfo();
+      QualType QT = typeInfo->getType().getUnqualifiedType();
+      const Type *type = QT.getTypePtr();
+      StringRef name = type->getAsCXXRecordDecl()->getName();
+      const auto found = N.cuda2hipRename.find(name);
+      if (found != N.cuda2hipRename.end()) {
+        countReps[found->second.countType]++;
+        StringRef repName = found->second.hipName;
+        TypeLoc TL = typeInfo->getTypeLoc();
+        SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+        SourceManager *SM = Result.SourceManager;
+        Replacement Rep(*SM, sl, name.size(), repName);
+        Replace->insert(Rep);
+      }
+      return true;
+    }
+    return false;
+  }
+
+  bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *sharedVar = Result.Nodes.getNodeAs<VarDecl>("cudaSharedIncompleteArrayVar")) {
       // Example: extern __shared__ uint sRadix1[];
       if (sharedVar->hasExternalFormalLinkage()) {
         QualType QT = sharedVar->getType();
@@ -1477,6 +1525,7 @@ public:
         if (!typeName.empty()) {
           SourceLocation slStart = sharedVar->getLocStart();
           SourceLocation slEnd = sharedVar->getLocEnd();
+          SourceManager *SM = Result.SourceManager;
           size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
           SmallString<128> tmpData;
           StringRef varName = sharedVar->getNameAsString();
@@ -1486,28 +1535,14 @@ public:
           countReps[CONV_MEM]++;
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const VarDecl *cudaStructVarPtr =
-            Result.Nodes.getNodeAs<VarDecl>("cudaStructVarPtr")) {
-      const Type *t = cudaStructVarPtr->getType().getTypePtrOrNull();
-      if (t) {
-        StringRef name = t->getPointeeCXXRecordDecl()->getName();
-        const auto found = N.cuda2hipRename.find(name);
-        if (found != N.cuda2hipRename.end()) {
-          countReps[found->second.countType]++;
-          StringRef repName = found->second.hipName;
-          TypeLoc TL = cudaStructVarPtr->getTypeSourceInfo()->getTypeLoc();
-          SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
-          Replacement Rep(*SM, sl, name.size(), repName);
-          Replace->insert(Rep);
-        }
-      }
-    }
-
-    if (const ParmVarDecl *cudaParamDecl =
-            Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDecl")) {
-      QualType QT = cudaParamDecl->getOriginalType().getUnqualifiedType();
+  bool cudaParamDecl(const MatchFinder::MatchResult &Result) {
+    if (const ParmVarDecl *paramDecl = Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDecl")) {
+      QualType QT = paramDecl->getOriginalType().getUnqualifiedType();
       StringRef name = QT.getAsString();
       const Type *t = QT.getTypePtr();
       if (t->isStructureOrClassType()) {
@@ -1517,64 +1552,91 @@ public:
       if (found != N.cuda2hipRename.end()) {
         countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
-        TypeLoc TL = cudaParamDecl->getTypeSourceInfo()->getTypeLoc();
+        TypeLoc TL = paramDecl->getTypeSourceInfo()->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+        SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
         Replace->insert(Rep);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const ParmVarDecl *cudaParamDeclPtr =
-            Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDeclPtr")) {
-      const Type *pt = cudaParamDeclPtr->getType().getTypePtrOrNull();
+  bool cudaParamDeclPtr(const MatchFinder::MatchResult &Result) {
+    if (const ParmVarDecl *paramDeclPtr = Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDeclPtr")) {
+      const Type *pt = paramDeclPtr->getType().getTypePtrOrNull();
       if (pt) {
         QualType QT = pt->getPointeeType();
         const Type *t = QT.getTypePtr();
         StringRef name = t->isStructureOrClassType()
-                             ? t->getAsCXXRecordDecl()->getName()
-                             : StringRef(QT.getAsString());
+          ? t->getAsCXXRecordDecl()->getName()
+          : StringRef(QT.getAsString());
         const auto found = N.cuda2hipRename.find(name);
         if (found != N.cuda2hipRename.end()) {
           countReps[found->second.countType]++;
           StringRef repName = found->second.hipName;
-          TypeLoc TL = cudaParamDeclPtr->getTypeSourceInfo()->getTypeLoc();
+          TypeLoc TL = paramDeclPtr->getTypeSourceInfo()->getTypeLoc();
           SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+          SourceManager *SM = Result.SourceManager;
           Replacement Rep(*SM, sl, name.size(), repName);
           Replace->insert(Rep);
         }
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const StringLiteral *stringLiteral =
-            Result.Nodes.getNodeAs<StringLiteral>("stringLiteral")) {
-      if (stringLiteral->getCharByteWidth() == 1) {
-        StringRef s = stringLiteral->getString();
-        processString(s, N, Replace, *SM, stringLiteral->getLocStart(),
-                      countReps);
+  bool unresolvedTemplateName(const MatchFinder::MatchResult &Result) {
+    if (const FunctionTemplateDecl *templateDecl = Result.Nodes.getNodeAs<FunctionTemplateDecl>("unresolvedTemplateName")) {
+      FunctionDecl *kernelDecl = templateDecl->getTemplatedDecl();
+      convertKernelDecl(kernelDecl, Result);
+      return true;
+    }
+    return false;
+  }
+
+  bool stringLiteral(const MatchFinder::MatchResult &Result) {
+    if (const StringLiteral *sLiteral = Result.Nodes.getNodeAs<StringLiteral>("stringLiteral")) {
+      if (sLiteral->getCharByteWidth() == 1) {
+        StringRef s = sLiteral->getString();
+        SourceManager *SM = Result.SourceManager;
+        processString(s, N, Replace, *SM, sLiteral->getLocStart(), countReps);
       }
+      return true;
     }
+    return false;
+  }
 
-    if (const UnaryExprOrTypeTraitExpr *expr =
-            Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>(
-                "cudaStructSizeOf")) {
-      TypeSourceInfo *typeInfo = expr->getArgumentTypeInfo();
-      QualType QT = typeInfo->getType().getUnqualifiedType();
-      const Type *type = QT.getTypePtr();
-      StringRef name = type->getAsCXXRecordDecl()->getName();
-      const auto found = N.cuda2hipRename.find(name);
-      if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
-        StringRef repName = found->second.hipName;
-        TypeLoc TL = typeInfo->getTypeLoc();
-        SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
-        Replacement Rep(*SM, sl, name.size(), repName);
-        Replace->insert(Rep);
-      }
-    }
+public:
+  Cuda2HipCallback(Replacements *Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks)
+    : Replace(Replace), owner(parent), PP(PPCallbacks) {
+    PP->setMatch(this);
+  }
 
+  void run(const MatchFinder::MatchResult &Result) override {
+    do {
+      if (cudaCall(Result)) break;
+      if (cudaLaunchKernel(Result)) break;
+      if (cudaBuiltin(Result)) break;
+      if (cudaEnumConstantRef(Result)) break;
+      if (cudaEnumConstantDecl(Result)) break;
+      if (cudaTypedefVar(Result)) break;
+      if (cudaStructVar(Result)) break;
+      if (cudaStructVarPtr(Result)) break;
+      if (cudaStructSizeOf(Result)) break;
+      if (cudaSharedIncompleteArrayVar(Result)) break;
+      if (cudaParamDecl(Result)) break;
+      if (cudaParamDeclPtr(Result)) break;
+      if (stringLiteral(Result)) break;
+      if (unresolvedTemplateName(Result)) break;
+      break;
+    } while (false);
     if (PP->countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 &&
             countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) {
       StringRef repName = "#include <hip_runtime.h>\n";
+      SourceManager *SM = Result.SourceManager;
       Replacement Rep(*SM, SM->getLocForStartOfFile(SM->getMainFileID()), 0, repName);
       Replace->insert(Rep);
       countReps[CONV_INCLUDE_CUDA_MAIN_H]++;
@@ -1592,7 +1654,7 @@ private:
 
 void HipifyPPCallbacks::handleEndSource() {
   if (Match->countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 &&
-    countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) {
+             countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) {
     StringRef repName = "#include <hip_runtime.h>\n";
     Replacement Rep(*_sm, _sm->getLocForStartOfFile(_sm->getMainFileID()), 0, repName);
     Replace->insert(Rep);
@@ -1621,18 +1683,75 @@ static cl::opt<bool>
 
 static cl::opt<bool>
     PrintStats("print-stats", cl::desc("print the command-line, like a header"),
-               cl::value_desc("print-stats"));
+    cl::value_desc("print-stats"));
+
+void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callback) {
+  Finder.addMatcher(callExpr(isExpansionInMainFile(),
+                             callee(functionDecl(matchesName("cuda.*|cublas.*"))))
+                             .bind("cudaCall"),
+                             Callback);
+  Finder.addMatcher(cudaKernelCallExpr().bind("cudaLaunchKernel"), Callback);
+  Finder.addMatcher(memberExpr(isExpansionInMainFile(),
+                               hasObjectExpression(hasType(cxxRecordDecl(
+                               matchesName("__cuda_builtin_")))))
+                               .bind("cudaBuiltin"),
+                               Callback);
+  Finder.addMatcher(declRefExpr(isExpansionInMainFile(),
+                                to(enumConstantDecl(
+                                matchesName("cuda.*|cublas.*|CUDA.*|CUBLAS*"))))
+                                .bind("cudaEnumConstantRef"),
+                                Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(enumDecl()))
+                            .bind("cudaEnumConstantDecl"),
+                            Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(typedefDecl(matchesName("cuda.*|cublas.*"))))
+                            .bind("cudaTypedefVar"),
+                            Callback);
+  // Array of elements of typedef type, Example: cudaStream_t streams[2];
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(arrayType(hasElementType(typedefType(
+                            hasDeclaration(typedefDecl(matchesName("cuda.*|cublas.*"))))))))
+                            .bind("cudaTypedefVar"),
+                            Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(cxxRecordDecl(matchesName("cuda.*|cublas.*"))))
+                            .bind("cudaStructVar"),
+                            Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(pointsTo(cxxRecordDecl(
+                                             matchesName("cuda.*|cublas.*")))))
+                            .bind("cudaStructVarPtr"),
+                            Callback);
+  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
+                                hasType(namedDecl(matchesName("cuda.*|cublas.*"))))
+                                .bind("cudaParamDecl"),
+                                Callback);
+  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
+                                hasType(pointsTo(namedDecl(
+                                                 matchesName("cuda.*|cublas.*")))))
+                                .bind("cudaParamDeclPtr"),
+                                Callback);
+  Finder.addMatcher(expr(isExpansionInMainFile(),
+                         sizeOfExpr(hasArgumentOfType(recordType(hasDeclaration(
+                             cxxRecordDecl(matchesName("cuda.*|cublas.*")))))))
+                        .bind("cudaStructSizeOf"),
+                         Callback);
+  Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"),
+                                  Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(), allOf(
+                            hasAttr(attr::CUDAShared),
+                            hasType(incompleteArrayType())))
+                           .bind("cudaSharedIncompleteArrayVar"),
+                            Callback);
+}
 
 int main(int argc, const char **argv) {
-
   llvm::sys::PrintStackTraceOnErrorSignal();
-
   int Result;
-
   CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::Required);
-
   std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
-
   std::string dst = OutputFilename;
   if (dst.empty()) {
     dst = fileSources[0];
@@ -1664,84 +1783,19 @@ int main(int argc, const char **argv) {
   HipifyPPCallbacks PPCallbacks(&Tool.getReplacements());
   Cuda2HipCallback Callback(&Tool.getReplacements(), &Finder, &PPCallbacks);
 
-  Finder.addMatcher(callExpr(isExpansionInMainFile(),
-                             callee(functionDecl(matchesName("cuda.*|cublas.*"))))
-                             .bind("cudaCall"),
-                             &Callback);
-  Finder.addMatcher(cudaKernelCallExpr().bind("cudaLaunchKernel"), &Callback);
-  Finder.addMatcher(memberExpr(isExpansionInMainFile(),
-                               hasObjectExpression(hasType(cxxRecordDecl(
-                               matchesName("__cuda_builtin_")))))
-                               .bind("cudaBuiltin"),
-                               &Callback);
-  Finder.addMatcher(declRefExpr(isExpansionInMainFile(),
-                                to(enumConstantDecl(
-                                matchesName("cuda.*|cublas.*|CUDA.*|CUBLAS*"))))
-                                .bind("cudaEnumConstantRef"),
-                                &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(enumDecl()))
-                            .bind("cudaEnumConstantDecl"),
-                            &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(typedefDecl(matchesName("cuda.*|cublas.*"))))
-                            .bind("cudaTypedefVar"),
-                            &Callback);
-  // Array of elements of typedef type, Example: cudaStream_t streams[2];
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(arrayType(hasElementType(typedefType(
-                            hasDeclaration(typedefDecl(matchesName("cuda.*|cublas.*"))))))))
-                            .bind("cudaTypedefVar"),
-                            &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(cxxRecordDecl(matchesName("cuda.*|cublas.*"))))
-                            .bind("cudaStructVar"),
-                            &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(),
-                            hasType(pointsTo(cxxRecordDecl(
-                                             matchesName("cuda.*|cublas.*")))))
-                            .bind("cudaStructVarPtr"),
-                            &Callback);
-  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
-                                hasType(namedDecl(matchesName("cuda.*|cublas.*"))))
-                                .bind("cudaParamDecl"),
-                                &Callback);
-  Finder.addMatcher(parmVarDecl(isExpansionInMainFile(),
-                                hasType(pointsTo(namedDecl(
-                                                 matchesName("cuda.*|cublas.*")))))
-                                .bind("cudaParamDeclPtr"),
-                                &Callback);
-  Finder.addMatcher(expr(isExpansionInMainFile(),
-                         sizeOfExpr(hasArgumentOfType(recordType(hasDeclaration(
-                             cxxRecordDecl(matchesName("cuda.*|cublas.*")))))))
-                        .bind("cudaStructSizeOf"),
-                         &Callback);
-  Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"),
-                                  &Callback);
-  Finder.addMatcher(varDecl(isExpansionInMainFile(), allOf(
-                            hasAttr(attr::CUDAShared),
-                            hasType(incompleteArrayType())))
-                           .bind("cudaSharedIncompleteArrayVar"),
-                            &Callback);
+  addAllMatchers(Finder, &Callback);
 
   auto action = newFrontendActionFactory(&Finder, &PPCallbacks);
-
-  std::vector<const char *> compilationStages;
+  std::vector<const char*> compilationStages;
   compilationStages.push_back("--cuda-host-only");
-
-  for (auto Stage : compilationStages) {
-    Tool.appendArgumentsAdjuster(
-        getInsertArgumentAdjuster(Stage, ArgumentInsertPosition::BEGIN));
-    Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11"));
+  Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster(compilationStages[0], ArgumentInsertPosition::BEGIN));
+  Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11"));
 #if defined(HIPIFY_CLANG_RES)
-    Tool.appendArgumentsAdjuster(
-        getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
+  Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
 #endif
-    Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster());
-    Result = Tool.run(action.get());
-
-    Tool.clearArgumentsAdjusters();
-  }
+  Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster());
+  Result = Tool.run(action.get());
+  Tool.clearArgumentsAdjusters();
 
   LangOptions DefaultLangOptions;
   IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
@@ -1749,13 +1803,13 @@ int main(int argc, const char **argv) {
   DiagnosticsEngine Diagnostics(
       IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs()), &*DiagOpts,
       &DiagnosticPrinter, false);
-  SourceManager Sources(Diagnostics, Tool.getFiles());
 
   DEBUG(dbgs() << "Replacements collected by the tool:\n");
   for (const auto &r : Tool.getReplacements()) {
     DEBUG(dbgs() << r.toString() << "\n");
   }
 
+  SourceManager Sources(Diagnostics, Tool.getFiles());
   Rewriter Rewrite(Sources, DefaultLangOptions);
 
   if (!Tool.applyAllReplacements(Rewrite)) {

From 36d212c81e40f0bd8aadb6653d482a97790ef583 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Thu, 25 Aug 2016 19:36:37 +0300
Subject: [PATCH 12/56] clang-hipify: code refactoring - API
 (Driver/Runtime/Blas) distinguishing is added.

---
 clang-hipify/src/Cuda2Hip.cpp | 1284 +++++++++++++++++----------------
 1 file changed, 656 insertions(+), 628 deletions(-)

diff --git a/clang-hipify/src/Cuda2Hip.cpp b/clang-hipify/src/Cuda2Hip.cpp
index b07b21dfa3..fde5b7e55b 100644
--- a/clang-hipify/src/Cuda2Hip.cpp
+++ b/clang-hipify/src/Cuda2Hip.cpp
@@ -70,16 +70,27 @@ enum ConvTypes {
   CONV_OTHER,
   CONV_INCLUDE,
   CONV_INCLUDE_CUDA_MAIN_H,
+  CONV_TYPE,
   CONV_LITERAL,
-  CONV_BLAS,
+  CONV_NUMERIC_LITERAL,
   CONV_LAST
 };
 
-const char *counterNames[ConvTypes::CONV_LAST] = {
-    "dev",          "mem",    "kern",    "coord_func",   "math_func",
-    "special_func", "stream", "event",   "err",          "def",
-    "tex",          "other",  "include", "include_cuda_main_header",
-    "literal",      "blas"};
+const char *counterNames[CONV_LAST] = {
+    "dev",          "mem",     "kern",    "coord_func",   "math_func",
+    "special_func", "stream",  "event",   "err",          "def",
+    "tex",          "other",   "include", "include_cuda_main_header",
+    "type",         "literal", "numeric_literal"};
+
+enum ApiTypes {
+  API_DRIVER = 0,
+  API_RUNTIME,
+  API_BLAS,
+  API_LAST
+};
+
+const char *apiNames[API_LAST] = {
+    "CUDA API", "CUDA RT API", "CUDA BLAS API"};
 
 namespace {
 
@@ -90,560 +101,560 @@ struct cuda2hipMap {
     cudaExcludes = {"CHECK_CUDA_ERROR", "CUDA_SAFE_CALL"};
 
     // Defines
-    cuda2hipRename["__CUDACC__"] = {"__HIPCC__", CONV_DEF};
+    cuda2hipRename["__CUDACC__"] = {"__HIPCC__", CONV_DEF, API_RUNTIME};
 
     // CUDA includes
-    cuda2hipRename["cuda.h"]             = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H};
-    cuda2hipRename["cuda_runtime.h"] =     {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H};
-    cuda2hipRename["cuda_runtime_api.h"] = {"hip_runtime_api.h", CONV_INCLUDE};
+    cuda2hipRename["cuda.h"]             = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER};
+    cuda2hipRename["cuda_runtime.h"] =     {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME};
+    cuda2hipRename["cuda_runtime_api.h"] = {"hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME};
 
     // HIP includes
     // TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h
-    //cuda2hipRename["cudacommon.h"] = {"hipcommon.h", CONV_INCLUDE};
+    //cuda2hipRename["cudacommon.h"] = {"hipcommon.h", CONV_INCLUDE, API_RUNTIME};
 
     // CUBLAS includes
-    cuda2hipRename["cublas.h"]           = {"hipblas.h", CONV_INCLUDE};
-    cuda2hipRename["cublas_v2.h"]        = {"hipblas.h", CONV_INCLUDE};
+    cuda2hipRename["cublas.h"]           = {"hipblas.h", CONV_INCLUDE, API_BLAS};
+    cuda2hipRename["cublas_v2.h"]        = {"hipblas.h", CONV_INCLUDE, API_BLAS};
 
     // Error codes and return types
-    cuda2hipRename["cudaError_t"]                          = {"hipError_t", CONV_ERR};
-    cuda2hipRename["cudaError"]                            = {"hipError", CONV_ERR};
-    cuda2hipRename["cudaSuccess"]                          = {"hipSuccess", CONV_ERR};
-    cuda2hipRename["cudaErrorUnknown"]                     = {"hipErrorUnknown", CONV_ERR};
-    cuda2hipRename["cudaErrorMemoryAllocation"]            = {"hipErrorMemoryAllocation", CONV_ERR};
-    cuda2hipRename["cudaErrorMemoryFree"]                  = {"hipErrorMemoryFree", CONV_ERR};
-    cuda2hipRename["cudaErrorUnknownSymbol"]               = {"hipErrorUnknownSymbol", CONV_ERR};
-    cuda2hipRename["cudaErrorOutOfResources"]              = {"hipErrorOutOfResources", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidValue"]                = {"hipErrorInvalidValue", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidResourceHandle"]       = {"hipErrorInvalidResourceHandle", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidDevice"]               = {"hipErrorInvalidDevice", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidMemcpyDirection"]      = {"hipErrorInvalidMemcpyDirection", CONV_ERR};
-    cuda2hipRename["cudaErrorInvalidDevicePointer"]        = {"hipErrorInvalidDevicePointer", CONV_ERR};
-    cuda2hipRename["cudaErrorInitializationError"]         = {"hipErrorInvalidDevicePointer", CONV_ERR};
-    cuda2hipRename["cudaErrorNoDevice"]                    = {"hipErrorNoDevice", CONV_ERR};
-    cuda2hipRename["cudaErrorNotReady"]                    = {"hipErrorNotReady", CONV_ERR};
-    cuda2hipRename["cudaErrorPeerAccessNotEnabled"]        = {"hipErrorPeerAccessNotEnabled", CONV_ERR};
-    cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"]    = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR};
+    cuda2hipRename["cudaError_t"]                          = {"hipError_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaError"]                            = {"hipError", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaSuccess"]                          = {"hipSuccess", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorUnknown"]                     = {"hipErrorUnknown", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorMemoryAllocation"]            = {"hipErrorMemoryAllocation", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorMemoryFree"]                  = {"hipErrorMemoryFree", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorUnknownSymbol"]               = {"hipErrorUnknownSymbol", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorOutOfResources"]              = {"hipErrorOutOfResources", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidValue"]                = {"hipErrorInvalidValue", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidResourceHandle"]       = {"hipErrorInvalidResourceHandle", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidDevice"]               = {"hipErrorInvalidDevice", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidMemcpyDirection"]      = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInvalidDevicePointer"]        = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorInitializationError"]         = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorNoDevice"]                    = {"hipErrorNoDevice", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorNotReady"]                    = {"hipErrorNotReady", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorPeerAccessNotEnabled"]        = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"]    = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME};
     // NOTE: no corresponding error type in CUDA
-    //cuda2hipRename["cudaErrorRuntimeMemory"]               = {"hipErrorRuntimeMemory", CONV_ERR};
-    //cuda2hipRename["cudaErrorRuntimeOther"]                = {"hipErrorRuntimeOther", CONV_ERR};
-    cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR};
-    cuda2hipRename["cudaErrorHostMemoryNotRegistered"]     = {"hipErrorHostMemoryNotRegistered", CONV_ERR};
+    //cuda2hipRename["cudaErrorRuntimeMemory"]               = {"hipErrorRuntimeMemory", CONV_ERR, API_RUNTIME};
+    //cuda2hipRename["cudaErrorRuntimeOther"]                = {"hipErrorRuntimeOther", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaErrorHostMemoryNotRegistered"]     = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME};
 
     // Error API
-    cuda2hipRename["cudaGetLastError"]               = {"hipGetLastError", CONV_ERR};
-    cuda2hipRename["cudaPeekAtLastError"]            = {"hipPeekAtLastError", CONV_ERR};
-    cuda2hipRename["cudaGetErrorName"]               = {"hipGetErrorName", CONV_ERR};
-    cuda2hipRename["cudaGetErrorString"]             = {"hipGetErrorString", CONV_ERR};
+    cuda2hipRename["cudaGetLastError"]               = {"hipGetLastError", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaPeekAtLastError"]            = {"hipPeekAtLastError", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaGetErrorName"]               = {"hipGetErrorName", CONV_ERR, API_RUNTIME};
+    cuda2hipRename["cudaGetErrorString"]             = {"hipGetErrorString", CONV_ERR, API_RUNTIME};
 
     // Memcpy
-    cuda2hipRename["cudaMemcpy"]               = {"hipMemcpy", CONV_MEM};
-    cuda2hipRename["cudaMemcpyToSymbol"]       = {"hipMemcpyToSymbol", CONV_MEM};
-    cuda2hipRename["cudaMemset"]               = {"hipMemset", CONV_MEM};
-    cuda2hipRename["cudaMemsetAsync"]          = {"hipMemsetAsync", CONV_MEM};
-    cuda2hipRename["cudaMemcpyAsync"]          = {"hipMemcpyAsync", CONV_MEM};
-    cuda2hipRename["cudaMemGetInfo"]           = {"hipMemGetInfo", CONV_MEM};
+    cuda2hipRename["cudaMemcpy"]               = {"hipMemcpy", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyToSymbol"]       = {"hipMemcpyToSymbol", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemset"]               = {"hipMemset", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemsetAsync"]          = {"hipMemsetAsync", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyAsync"]          = {"hipMemcpyAsync", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemGetInfo"]           = {"hipMemGetInfo", CONV_MEM, API_RUNTIME};
     // Memcpy kind
-    cuda2hipRename["cudaMemcpyKind"]           = {"hipMemcpyKind", CONV_MEM};
-    cuda2hipRename["cudaMemcpyHostToHost"]     = {"hipMemcpyHostToHost", CONV_MEM};
-    cuda2hipRename["cudaMemcpyHostToDevice"]   = {"hipMemcpyHostToDevice", CONV_MEM};
-    cuda2hipRename["cudaMemcpyDeviceToHost"]   = {"hipMemcpyDeviceToHost", CONV_MEM};
-    cuda2hipRename["cudaMemcpyDeviceToDevice"] = {"hipMemcpyDeviceToDevice", CONV_MEM};
-    cuda2hipRename["cudaMemcpyDefault"]        = {"hipMemcpyDefault", CONV_MEM};
+    cuda2hipRename["cudaMemcpyKind"]           = {"hipMemcpyKind", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyHostToHost"]     = {"hipMemcpyHostToHost", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyHostToDevice"]   = {"hipMemcpyHostToDevice", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyDeviceToHost"]   = {"hipMemcpyDeviceToHost", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyDeviceToDevice"] = {"hipMemcpyDeviceToDevice", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyDefault"]        = {"hipMemcpyDefault", CONV_MEM, API_RUNTIME};
 
     // Memory management
-    cuda2hipRename["cudaMalloc"]         = {"hipMalloc", CONV_MEM};
-    cuda2hipRename["cudaMallocHost"]     = {"hipHostMalloc", CONV_MEM};
-    cuda2hipRename["cudaFree"]           = {"hipFree", CONV_MEM};
-    cuda2hipRename["cudaFreeHost"]       = {"hipHostFree", CONV_MEM};
-    cuda2hipRename["cudaHostRegister"]   = {"hipHostRegister", CONV_MEM};
-    cuda2hipRename["cudaHostUnregister"] = {"hipHostUnregister", CONV_MEM};
+    cuda2hipRename["cudaMalloc"]         = {"hipMalloc", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMallocHost"]     = {"hipHostMalloc", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaFree"]           = {"hipFree", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaFreeHost"]       = {"hipHostFree", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegister"]   = {"hipHostRegister", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_RUNTIME};
 
     // Memory types
-    cuda2hipRename["cudaMemoryType"]       = {"hipMemoryType", CONV_MEM};
-    cuda2hipRename["cudaMemoryTypeHost"]   = {"hipMemoryTypeHost", CONV_MEM};
-    cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM};
+    cuda2hipRename["cudaMemoryType"]       = {"hipMemoryType", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemoryTypeHost"]   = {"hipMemoryTypeHost", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM, API_RUNTIME};
 
     // Host Malloc Flags
-    cuda2hipRename["cudaHostAllocDefault"]       = {"hipHostMallocDefault", CONV_MEM};
-    cuda2hipRename["cudaHostAllocPortable"]      = {"hipHostMallocPortable", CONV_MEM};
-    cuda2hipRename["cudaHostAllocMapped"]        = {"hipHostMallocMapped", CONV_MEM};
-    cuda2hipRename["cudaHostAllocWriteCombined"] = {"hipHostMallocWriteCombined", CONV_MEM};
+    cuda2hipRename["cudaHostAllocDefault"]       = {"hipHostMallocDefault", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostAllocPortable"]      = {"hipHostMallocPortable", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostAllocMapped"]        = {"hipHostMallocMapped", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostAllocWriteCombined"] = {"hipHostMallocWriteCombined", CONV_MEM, API_RUNTIME};
 
     // Host Register Flags
-    cuda2hipRename["cudaHostGetFlags"]         = {"hipHostGetFlags", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterDefault"]  = {"hipHostRegisterDefault", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterPortable"] = {"hipHostRegisterPortable", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterMapped"]   = {"hipHostRegisterMapped", CONV_MEM};
-    cuda2hipRename["cudaHostRegisterIoMemory"] = {"hipHostRegisterIoMemory", CONV_MEM};
+    cuda2hipRename["cudaHostGetFlags"]         = {"hipHostGetFlags", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterDefault"]  = {"hipHostRegisterDefault", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterPortable"] = {"hipHostRegisterPortable", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterMapped"]   = {"hipHostRegisterMapped", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaHostRegisterIoMemory"] = {"hipHostRegisterIoMemory", CONV_MEM, API_RUNTIME};
 
     // Coordinate Indexing and Dimensions
-    cuda2hipRename["threadIdx.x"] = {"hipThreadIdx_x", CONV_COORD_FUNC};
-    cuda2hipRename["threadIdx.y"] = {"hipThreadIdx_y", CONV_COORD_FUNC};
-    cuda2hipRename["threadIdx.z"] = {"hipThreadIdx_z", CONV_COORD_FUNC};
+    cuda2hipRename["threadIdx.x"] = {"hipThreadIdx_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["threadIdx.y"] = {"hipThreadIdx_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["threadIdx.z"] = {"hipThreadIdx_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockIdx.x"]  = {"hipBlockIdx_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.y"]  = {"hipBlockIdx_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockIdx.z"]  = {"hipBlockIdx_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["blockDim.x"]  = {"hipBlockDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.y"]  = {"hipBlockDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["blockDim.z"]  = {"hipBlockDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC};
-    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC};
+    cuda2hipRename["gridDim.x"]   = {"hipGridDim_x", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.y"]   = {"hipGridDim_y", CONV_COORD_FUNC, API_RUNTIME};
+    cuda2hipRename["gridDim.z"]   = {"hipGridDim_z", CONV_COORD_FUNC, API_RUNTIME};
 
-    cuda2hipRename["warpSize"]    = {"hipWarpSize", CONV_SPECIAL_FUNC};
+    cuda2hipRename["warpSize"]    = {"hipWarpSize", CONV_SPECIAL_FUNC, API_RUNTIME};
 
     // Events
-    cuda2hipRename["cudaEvent_t"]              = {"hipEvent_t", CONV_EVENT};
-    cuda2hipRename["cudaEventCreate"]          = {"hipEventCreate", CONV_EVENT};
-    cuda2hipRename["cudaEventCreateWithFlags"] = {"hipEventCreateWithFlags", CONV_EVENT};
-    cuda2hipRename["cudaEventDestroy"]         = {"hipEventDestroy", CONV_EVENT};
-    cuda2hipRename["cudaEventRecord"]          = {"hipEventRecord", CONV_EVENT};
-    cuda2hipRename["cudaEventElapsedTime"]     = {"hipEventElapsedTime", CONV_EVENT};
-    cuda2hipRename["cudaEventSynchronize"]     = {"hipEventSynchronize", CONV_EVENT};
-    cuda2hipRename["cudaEventQuery"]           = {"hipEventQuery", CONV_EVENT};
+    cuda2hipRename["cudaEvent_t"]              = {"hipEvent_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaEventCreate"]          = {"hipEventCreate", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventCreateWithFlags"] = {"hipEventCreateWithFlags", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventDestroy"]         = {"hipEventDestroy", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventRecord"]          = {"hipEventRecord", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventElapsedTime"]     = {"hipEventElapsedTime", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventSynchronize"]     = {"hipEventSynchronize", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventQuery"]           = {"hipEventQuery", CONV_EVENT, API_RUNTIME};
     // Event Flags
-    cuda2hipRename["cudaEventDefault"]         = {"hipEventDefault", CONV_EVENT};
-    cuda2hipRename["cudaEventBlockingSync"]    = {"hipEventBlockingSync", CONV_EVENT};
-    cuda2hipRename["cudaEventDisableTiming"]   = {"hipEventDisableTiming", CONV_EVENT};
-    cuda2hipRename["cudaEventInterprocess"]    = {"hipEventInterprocess", CONV_EVENT};
+    cuda2hipRename["cudaEventDefault"]         = {"hipEventDefault", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventBlockingSync"]    = {"hipEventBlockingSync", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventDisableTiming"]   = {"hipEventDisableTiming", CONV_EVENT, API_RUNTIME};
+    cuda2hipRename["cudaEventInterprocess"]    = {"hipEventInterprocess", CONV_EVENT, API_RUNTIME};
 
     // Streams
-    cuda2hipRename["cudaStream_t"]              = {"hipStream_t", CONV_STREAM};
-    cuda2hipRename["cudaStreamCreate"]          = {"hipStreamCreate", CONV_STREAM};
-    cuda2hipRename["cudaStreamCreateWithFlags"] = {"hipStreamCreateWithFlags", CONV_STREAM};
-    cuda2hipRename["cudaStreamDestroy"]         = {"hipStreamDestroy", CONV_STREAM};
-    cuda2hipRename["cudaStreamWaitEvent"]       = {"hipStreamWaitEvent", CONV_STREAM};
-    cuda2hipRename["cudaStreamSynchronize"]     = {"hipStreamSynchronize", CONV_STREAM};
+    cuda2hipRename["cudaStream_t"]              = {"hipStream_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaStreamCreate"]          = {"hipStreamCreate", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamCreateWithFlags"] = {"hipStreamCreateWithFlags", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamDestroy"]         = {"hipStreamDestroy", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamWaitEvent"]       = {"hipStreamWaitEvent", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamSynchronize"]     = {"hipStreamSynchronize", CONV_STREAM, API_RUNTIME};
     // Stream Flags
-    cuda2hipRename["cudaStreamGetFlags"]        = {"hipStreamGetFlags", CONV_STREAM};
-    cuda2hipRename["cudaStreamDefault"]         = {"hipStreamDefault", CONV_STREAM};
-    cuda2hipRename["cudaStreamNonBlocking"]     = {"hipStreamNonBlocking", CONV_STREAM};
+    cuda2hipRename["cudaStreamGetFlags"]        = {"hipStreamGetFlags", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamDefault"]         = {"hipStreamDefault", CONV_STREAM, API_RUNTIME};
+    cuda2hipRename["cudaStreamNonBlocking"]     = {"hipStreamNonBlocking", CONV_STREAM, API_RUNTIME};
 
     // Other synchronization
-    cuda2hipRename["cudaDeviceSynchronize"] = {"hipDeviceSynchronize", CONV_DEV};
+    cuda2hipRename["cudaDeviceSynchronize"] = {"hipDeviceSynchronize", CONV_DEV, API_RUNTIME};
     // translate deprecated cudaThreadSynchronize
-    cuda2hipRename["cudaThreadSynchronize"] = {"hipDeviceSynchronize", CONV_DEV};
-    cuda2hipRename["cudaDeviceReset"]       = {"hipDeviceReset", CONV_DEV};
+    cuda2hipRename["cudaThreadSynchronize"] = {"hipDeviceSynchronize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceReset"]       = {"hipDeviceReset", CONV_DEV, API_RUNTIME};
     // translate deprecated cudaThreadExit
-    cuda2hipRename["cudaThreadExit"]        = {"hipDeviceReset", CONV_DEV};
-    cuda2hipRename["cudaSetDevice"]         = {"hipSetDevice", CONV_DEV};
-    cuda2hipRename["cudaGetDevice"]         = {"hipGetDevice", CONV_DEV};
+    cuda2hipRename["cudaThreadExit"]        = {"hipDeviceReset", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSetDevice"]         = {"hipSetDevice", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaGetDevice"]         = {"hipGetDevice", CONV_DEV, API_RUNTIME};
 
     // Attributes
-    cuda2hipRename["cudaDeviceAttr"]          = {"hipDeviceAttribute_t", CONV_DEV};
-    cuda2hipRename["cudaDeviceGetAttribute"]  = {"hipDeviceGetAttribute", CONV_DEV};
+    cuda2hipRename["cudaDeviceAttr"]          = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaDeviceGetAttribute"]  = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME};
 
-    cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"]               = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxBlockDimX"]                     = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxBlockDimY"]                     = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxBlockDimZ"]                     = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxGridDimX"]                      = {"hipDeviceAttributeMaxGridDimX", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxGridDimY"]                      = {"hipDeviceAttributeMaxGridDimY", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxGridDimZ"]                      = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"]          = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV};
-    cuda2hipRename["cudaDevAttrTotalConstantMemory"]              = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV};
-    cuda2hipRename["cudaDevAttrWarpSize"]                         = {"hipDeviceAttributeWarpSize", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"]             = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV};
-    cuda2hipRename["cudaDevAttrClockRate"]                        = {"hipDeviceAttributeClockRate", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMemoryClockRate"]                  = {"hipDeviceAttributeMemoryClockRate", CONV_DEV};
-    cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"]             = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMultiProcessorCount"]              = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV};
-    cuda2hipRename["cudaDevAttrComputeMode"]                      = {"hipDeviceAttributeComputeMode", CONV_DEV};
-    cuda2hipRename["cudaDevAttrL2CacheSize"]                      = {"hipDeviceAttributeL2CacheSize", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"]      = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV};
-    cuda2hipRename["cudaDevAttrComputeCapabilityMajor"]           = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV};
-    cuda2hipRename["cudaDevAttrConcurrentKernels"]                = {"hipDeviceAttributeConcurrentKernels", CONV_DEV};
-    cuda2hipRename["cudaDevAttrPciBusId"]                         = {"hipDeviceAttributePciBusId", CONV_DEV};
-    cuda2hipRename["cudaDevAttrPciDeviceId"]                      = {"hipDeviceAttributePciDeviceId", CONV_DEV};
-    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV};
-    cuda2hipRename["cudaDevAttrIsMultiGpuBoard"]                  = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV};
+    cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"]               = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimX"]                     = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimY"]                     = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimZ"]                     = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimX"]                      = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimY"]                      = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimZ"]                      = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"]          = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrTotalConstantMemory"]              = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrWarpSize"]                         = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"]             = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrClockRate"]                        = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMemoryClockRate"]                  = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"]             = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMultiProcessorCount"]              = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrComputeMode"]                      = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrL2CacheSize"]                      = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"]      = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrComputeCapabilityMajor"]           = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrConcurrentKernels"]                = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrPciBusId"]                         = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrPciDeviceId"]                      = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrIsMultiGpuBoard"]                  = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME};
 
     // Pointer Attributes
-    cuda2hipRename["cudaPointerAttributes"]      = {"hipPointerAttribute_t", CONV_MEM};
-    cuda2hipRename["cudaPointerGetAttributes"]   = {"hipPointerGetAttributes", CONV_MEM};
+    cuda2hipRename["cudaPointerAttributes"]      = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaPointerGetAttributes"]   = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME};
 
-    cuda2hipRename["cudaHostGetDevicePointer"]   = {"hipHostGetDevicePointer", CONV_MEM};
+    cuda2hipRename["cudaHostGetDevicePointer"]   = {"hipHostGetDevicePointer", CONV_MEM, API_RUNTIME};
 
     // Device
-    cuda2hipRename["cudaDeviceProp"]            = {"hipDeviceProp_t", CONV_DEV};
-    cuda2hipRename["cudaGetDeviceProperties"]   = {"hipGetDeviceProperties", CONV_DEV};
+    cuda2hipRename["cudaDeviceProp"]            = {"hipDeviceProp_t", CONV_TYPE, API_RUNTIME};
+    cuda2hipRename["cudaGetDeviceProperties"]   = {"hipGetDeviceProperties", CONV_DEV, API_RUNTIME};
 
     // Device Flags
-    cuda2hipRename["cudaSetDeviceFlags"]               = {"hipSetDeviceFlags", CONV_DEV};
-    cuda2hipRename["cudaDeviceScheduleAuto"]           = {"hipDeviceScheduleAuto", CONV_DEV};
-    cuda2hipRename["cudaDeviceScheduleSpin"]           = {"hipDeviceScheduleSpin", CONV_DEV};
-    cuda2hipRename["cudaDeviceScheduleYield"]          = {"hipDeviceScheduleYield", CONV_DEV};
+    cuda2hipRename["cudaSetDeviceFlags"]               = {"hipSetDeviceFlags", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceScheduleAuto"]           = {"hipDeviceScheduleAuto", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceScheduleSpin"]           = {"hipDeviceScheduleSpin", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceScheduleYield"]          = {"hipDeviceScheduleYield", CONV_DEV, API_RUNTIME};
     // deprecated as of CUDA 4.0 and replaced with cudaDeviceScheduleBlockingSync
-    cuda2hipRename["cudaDeviceBlockingSync"]           = {"hipDeviceBlockingSync", CONV_DEV};
+    cuda2hipRename["cudaDeviceBlockingSync"]           = {"hipDeviceBlockingSync", CONV_DEV, API_RUNTIME};
     // unsupported yet
-    //cuda2hipRename["cudaDeviceScheduleBlockingSync"] = {"hipDeviceScheduleBlockingSync", CONV_DEV};
-    //cuda2hipRename["cudaDeviceScheduleMask"]         = {"hipDeviceScheduleMask", CONV_DEV};
-    cuda2hipRename["cudaDeviceMapHost"]                = {"hipDeviceMapHost", CONV_DEV};
+    //cuda2hipRename["cudaDeviceScheduleBlockingSync"] = {"hipDeviceScheduleBlockingSync", CONV_DEV, API_RUNTIME};
+    //cuda2hipRename["cudaDeviceScheduleMask"]         = {"hipDeviceScheduleMask", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceMapHost"]                = {"hipDeviceMapHost", CONV_DEV, API_RUNTIME};
     // unsupported yet
-    //cuda2hipRename["cudaDeviceLmemResizeToMax"]      = {"hipDeviceLmemResizeToMax", CONV_DEV};
-    //cuda2hipRename["cudaDeviceMask"]                 = {"hipDeviceMask", CONV_DEV};
+    //cuda2hipRename["cudaDeviceLmemResizeToMax"]      = {"hipDeviceLmemResizeToMax", CONV_DEV, API_RUNTIME};
+    //cuda2hipRename["cudaDeviceMask"]                 = {"hipDeviceMask", CONV_DEV, API_RUNTIME};
 
     // Cache config
-    cuda2hipRename["cudaDeviceSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV};
+    cuda2hipRename["cudaDeviceSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV};
-    cuda2hipRename["cudaDeviceGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV};
+    cuda2hipRename["cudaThreadSetCacheConfig"]  = {"hipDeviceSetCacheConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV};
-    cuda2hipRename["cudaFuncCache"]             = {"hipFuncCache", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferNone"]   = {"hipFuncCachePreferNone", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferL1"]     = {"hipFuncCachePreferL1", CONV_DEV};
-    cuda2hipRename["cudaFuncCachePreferEqual"]  = {"hipFuncCachePreferEqual", CONV_DEV};
-    cuda2hipRename["cudaFuncSetCacheConfig"]    = {"hipFuncSetCacheConfig", CONV_DEV};
+    cuda2hipRename["cudaThreadGetCacheConfig"]  = {"hipDeviceGetCacheConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCache"]             = {"hipFuncCache", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferNone"]   = {"hipFuncCachePreferNone", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferL1"]     = {"hipFuncCachePreferL1", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncCachePreferEqual"]  = {"hipFuncCachePreferEqual", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaFuncSetCacheConfig"]    = {"hipFuncSetCacheConfig", CONV_DEV, API_RUNTIME};
 
     // Driver/Runtime
-    cuda2hipRename["cudaDriverGetVersion"] = {"hipDriverGetVersion", CONV_DEV};
-    cuda2hipRename["cudaGetDeviceCount"]   = {"hipGetDeviceCount", CONV_DEV};
+    cuda2hipRename["cudaDriverGetVersion"] = {"hipDriverGetVersion", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaGetDeviceCount"]   = {"hipGetDeviceCount", CONV_DEV, API_RUNTIME};
     // unsupported yet
-    //cuda2hipRename["cudaRuntimeGetVersion"] = {"hipRuntimeGetVersion", CONV_DEV};
+    //cuda2hipRename["cudaRuntimeGetVersion"] = {"hipRuntimeGetVersion", CONV_DEV, API_RUNTIME};
 
     // Peer2Peer
-    cuda2hipRename["cudaDeviceCanAccessPeer"]     = {"hipDeviceCanAccessPeer", CONV_DEV};
-    cuda2hipRename["cudaDeviceDisablePeerAccess"] = {"hipDeviceDisablePeerAccess", CONV_DEV};
-    cuda2hipRename["cudaDeviceEnablePeerAccess"]  = {"hipDeviceEnablePeerAccess", CONV_DEV};
-    cuda2hipRename["cudaMemcpyPeerAsync"]         = {"hipMemcpyPeerAsync", CONV_MEM};
-    cuda2hipRename["cudaMemcpyPeer"]              = {"hipMemcpyPeer", CONV_MEM};
+    cuda2hipRename["cudaDeviceCanAccessPeer"]     = {"hipDeviceCanAccessPeer", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceDisablePeerAccess"] = {"hipDeviceDisablePeerAccess", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceEnablePeerAccess"]  = {"hipDeviceEnablePeerAccess", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyPeerAsync"]         = {"hipMemcpyPeerAsync", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyPeer"]              = {"hipMemcpyPeer", CONV_MEM, API_RUNTIME};
 
     // Shared memory
-    cuda2hipRename["cudaDeviceSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV};
+    cuda2hipRename["cudaDeviceSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV};
-    cuda2hipRename["cudaDeviceGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV};
+    cuda2hipRename["cudaThreadSetSharedMemConfig"]   = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDeviceGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV, API_RUNTIME};
     // translate deprecated
-    cuda2hipRename["cudaThreadGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV};
-    cuda2hipRename["cudaSharedMemConfig"]            = {"hipSharedMemConfig", CONV_DEV};
-    cuda2hipRename["cudaSharedMemBankSizeDefault"]   = {"hipSharedMemBankSizeDefault", CONV_DEV};
-    cuda2hipRename["cudaSharedMemBankSizeFourByte"]  = {"hipSharedMemBankSizeFourByte", CONV_DEV};
-    cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV};
+    cuda2hipRename["cudaThreadGetSharedMemConfig"]   = {"hipDeviceGetSharedMemConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemConfig"]            = {"hipSharedMemConfig", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemBankSizeDefault"]   = {"hipSharedMemBankSizeDefault", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemBankSizeFourByte"]  = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_RUNTIME};
 
     // Profiler
     // unsupported yet
-    //cuda2hipRename["cudaProfilerInitialize"]  = {"hipProfilerInitialize", CONV_OTHER};
-    cuda2hipRename["cudaProfilerStart"]       = {"hipProfilerStart", CONV_OTHER};
-    cuda2hipRename["cudaProfilerStop"]        = {"hipProfilerStop", CONV_OTHER};
-    cuda2hipRename["cudaChannelFormatDesc"]   = {"hipChannelFormatDesc", CONV_TEX};
-    cuda2hipRename["cudaFilterModePoint"]     = {"hipFilterModePoint", CONV_TEX};
-    cuda2hipRename["cudaReadModeElementType"] = {"hipReadModeElementType", CONV_TEX};
+    //cuda2hipRename["cudaProfilerInitialize"]  = {"hipProfilerInitialize", CONV_OTHER, API_RUNTIME};
+    cuda2hipRename["cudaProfilerStart"]       = {"hipProfilerStart", CONV_OTHER, API_RUNTIME};
+    cuda2hipRename["cudaProfilerStop"]        = {"hipProfilerStop", CONV_OTHER, API_RUNTIME};
+    cuda2hipRename["cudaChannelFormatDesc"]   = {"hipChannelFormatDesc", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaFilterModePoint"]     = {"hipFilterModePoint", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaReadModeElementType"] = {"hipReadModeElementType", CONV_TEX, API_RUNTIME};
 
     // Channel descriptor
-    cuda2hipRename["cudaCreateChannelDesc"]   = {"hipCreateChannelDesc", CONV_TEX};
-    cuda2hipRename["cudaBindTexture"]         = {"hipBindTexture", CONV_TEX};
-    cuda2hipRename["cudaUnbindTexture"]       = {"hipUnbindTexture", CONV_TEX};
+    cuda2hipRename["cudaCreateChannelDesc"]   = {"hipCreateChannelDesc", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaBindTexture"]         = {"hipBindTexture", CONV_TEX, API_RUNTIME};
+    cuda2hipRename["cudaUnbindTexture"]       = {"hipUnbindTexture", CONV_TEX, API_RUNTIME};
 
     //---------------------------------------BLAS-------------------------------------//
     // Blas types
-    cuda2hipRename["cublasHandle_t"]                 = {"hipblasHandle_t", CONV_BLAS};
+    cuda2hipRename["cublasHandle_t"]                 = {"hipblasHandle_t", CONV_TYPE, API_BLAS};
     // Blas operations
-    cuda2hipRename["cublasOperation_t"]              = {"hipblasOperation_t", CONV_BLAS};
-    cuda2hipRename["CUBLAS_OP_N"]                    = {"HIPBLAS_OP_N", CONV_BLAS};
-    cuda2hipRename["CUBLAS_OP_T"]                    = {"HIPBLAS_OP_T", CONV_BLAS};
-    cuda2hipRename["CUBLAS_OP_C"]                    = {"HIPBLAS_OP_C", CONV_BLAS};
+    cuda2hipRename["cublasOperation_t"]              = {"hipblasOperation_t", CONV_TYPE, API_BLAS};
+    cuda2hipRename["CUBLAS_OP_N"]                    = {"HIPBLAS_OP_N", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_OP_T"]                    = {"HIPBLAS_OP_T", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_OP_C"]                    = {"HIPBLAS_OP_C", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas statuses
-    cuda2hipRename["cublasStatus_t"]                 = {"hipblasStatus_t", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_SUCCESS"]          = {"HIPBLAS_STATUS_SUCCESS", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_NOT_INITIALIZED"]  = {"HIPBLAS_STATUS_NOT_INITIALIZED", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_ALLOC_FAILED"]     = {"HIPBLAS_STATUS_ALLOC_FAILED", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_INVALID_VALUE"]    = {"HIPBLAS_STATUS_INVALID_VALUE", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_MAPPING_ERROR"]    = {"HIPBLAS_STATUS_MAPPING_ERROR", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_EXECUTION_FAILED"] = {"HIPBLAS_STATUS_EXECUTION_FAILED", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_INTERNAL_ERROR"]   = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_BLAS};
-    cuda2hipRename["CUBLAS_STATUS_NOT_SUPPORTED"]    = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_BLAS};
+    cuda2hipRename["cublasStatus_t"]                 = {"hipblasStatus_t", CONV_TYPE, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_SUCCESS"]          = {"HIPBLAS_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_NOT_INITIALIZED"]  = {"HIPBLAS_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_ALLOC_FAILED"]     = {"HIPBLAS_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_INVALID_VALUE"]    = {"HIPBLAS_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_MAPPING_ERROR"]    = {"HIPBLAS_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_EXECUTION_FAILED"] = {"HIPBLAS_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_INTERNAL_ERROR"]   = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_BLAS};
+    cuda2hipRename["CUBLAS_STATUS_NOT_SUPPORTED"]    = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Fill Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasFillMode_t"]               = {"hipblasFillMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_FILL_MODE_LOWER"]         = {"HIPBLAS_FILL_MODE_LOWER", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_FILL_MODE_UPPER"]         = {"HIPBLAS_FILL_MODE_UPPER", CONV_BLAS};
+    //cuda2hipRename["cublasFillMode_t"]               = {"hipblasFillMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_FILL_MODE_LOWER"]         = {"HIPBLAS_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_FILL_MODE_UPPER"]         = {"HIPBLAS_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Diag Types
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasDiagType_t"]               = {"hipblasDiagType_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DIAG_NON_UNIT"]           = {"HIPBLAS_DIAG_NON_UNIT", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DIAG_UNIT"]               = {"HIPBLAS_DIAG_UNIT", CONV_BLAS};
+    //cuda2hipRename["cublasDiagType_t"]               = {"hipblasDiagType_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_DIAG_NON_UNIT"]           = {"HIPBLAS_DIAG_NON_UNIT", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DIAG_UNIT"]               = {"HIPBLAS_DIAG_UNIT", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Side Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasSideMode_t"]               = {"hipblasSideMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_SIDE_LEFT"]               = {"HIPBLAS_SIDE_LEFT", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_SIDE_RIGHT"]              = {"HIPBLAS_SIDE_RIGHT", CONV_BLAS};
+    //cuda2hipRename["cublasSideMode_t"]               = {"hipblasSideMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_SIDE_LEFT"]               = {"HIPBLAS_SIDE_LEFT", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_SIDE_RIGHT"]              = {"HIPBLAS_SIDE_RIGHT", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Pointer Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasPointerMode_t"]            = {"hipblasPointerMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_POINTER_MODE_HOST"]       = {"HIPBLAS_POINTER_MODE_HOST", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_POINTER_MODE_DEVICE"]     = {"HIPBLAS_POINTER_MODE_DEVICE", CONV_BLAS};
+    //cuda2hipRename["cublasPointerMode_t"]            = {"hipblasPointerMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_POINTER_MODE_HOST"]       = {"HIPBLAS_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_POINTER_MODE_DEVICE"]     = {"HIPBLAS_POINTER_MODE_DEVICE", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Atomics Modes
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasAtomicsMode_t"]            = {"hipblasAtomicsMode_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_ATOMICS_NOT_ALLOWED"]     = {"HIPBLAS_ATOMICS_NOT_ALLOWED", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_ATOMICS_ALLOWED"]         = {"HIPBLAS_ATOMICS_ALLOWED", CONV_BLAS};
+    //cuda2hipRename["cublasAtomicsMode_t"]            = {"hipblasAtomicsMode_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_ATOMICS_NOT_ALLOWED"]     = {"HIPBLAS_ATOMICS_NOT_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_ATOMICS_ALLOWED"]         = {"HIPBLAS_ATOMICS_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS};
     // Blas Data Type
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasDataType_t"]               = {"hipblasDataType_t", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_FLOAT"]              = {"HIPBLAS_DATA_FLOAT", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_DOUBLE"]             = {"HIPBLAS_DATA_DOUBLE", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_HALF"]               = {"HIPBLAS_DATA_HALF", CONV_BLAS};
-    //cuda2hipRename["CUBLAS_DATA_INT8"]               = {"HIPBLAS_DATA_INT8", CONV_BLAS};
+    //cuda2hipRename["cublasDataType_t"]               = {"hipblasDataType_t", CONV_TYPE, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_FLOAT"]              = {"HIPBLAS_DATA_FLOAT", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_DOUBLE"]             = {"HIPBLAS_DATA_DOUBLE", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_HALF"]               = {"HIPBLAS_DATA_HALF", CONV_NUMERIC_LITERAL, API_BLAS};
+    //cuda2hipRename["CUBLAS_DATA_INT8"]               = {"HIPBLAS_DATA_INT8", CONV_NUMERIC_LITERAL, API_BLAS};
 
     // Blas1 (v1) Routines
-    cuda2hipRename["cublasCreate"]       = {"hipblasCreate", CONV_BLAS};
-    cuda2hipRename["cublasDestroy"]      = {"hipblasDestroy", CONV_BLAS};
+    cuda2hipRename["cublasCreate"]       = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDestroy"]      = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS};
 
-    cuda2hipRename["cublasSetVector"]    = {"hipblasSetVector", CONV_BLAS};
-    cuda2hipRename["cublasGetVector"]    = {"hipblasGetVector", CONV_BLAS};
-    cuda2hipRename["cublasSetMatrix"]    = {"hipblasSetMatrix", CONV_BLAS};
-    cuda2hipRename["cublasGetMatrix"]    = {"hipblasGetMatrix", CONV_BLAS};
+    cuda2hipRename["cublasSetVector"]    = {"hipblasSetVector", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasGetVector"]    = {"hipblasGetVector", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasSetMatrix"]    = {"hipblasSetMatrix", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasGetMatrix"]    = {"hipblasGetMatrix", CONV_MATH_FUNC, API_BLAS};
 
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasGetMatrixAsync"] = {"hipblasGetMatrixAsync", CONV_BLAS};
-    //cuda2hipRename["cublasSetMatrixAsync"] = {"hipblasSetMatrixAsync", CONV_BLAS};
+    //cuda2hipRename["cublasGetMatrixAsync"] = {"hipblasGetMatrixAsync", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasSetMatrixAsync"] = {"hipblasSetMatrixAsync", CONV_MATH_FUNC, API_BLAS};
 
     // NRM2
-    //cuda2hipRename["cublasSnrm2"]  = {"hipblasSnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDnrm2"]  = {"hipblasDnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasScnrm2"] = {"hipblasScnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDznrm2"] = {"hipblasDznrm2", CONV_BLAS};
+    //cuda2hipRename["cublasSnrm2"]  = {"hipblasSnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDnrm2"]  = {"hipblasDnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScnrm2"] = {"hipblasScnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDznrm2"] = {"hipblasDznrm2", CONV_MATH_FUNC, API_BLAS};
 
     // DOT
-    cuda2hipRename["cublasSdot"]        = {"hipblasSdot", CONV_BLAS};
+    cuda2hipRename["cublasSdot"]        = {"hipblasSdot", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSdotBatched"] = {"hipblasSdotBatched",CONV_BLAS};
-    cuda2hipRename["cublasDdot"]        = {"hipblasDdot", CONV_BLAS};
+    cuda2hipRename["cublasSdotBatched"] = {"hipblasSdotBatched",CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDdot"]        = {"hipblasDdot", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDdotBatched"] = {"hipblasDdotBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCdotu"] = {"hipblasCdotu", CONV_BLAS};
-    //cuda2hipRename["cublasCdotc"] = {"hipblasCdotc", CONV_BLAS};
-    //cuda2hipRename["cublasZdotu"] = {"hipblasZdotu", CONV_BLAS};
-    //cuda2hipRename["cublasZdotc"] = {"hipblasZdotc", CONV_BLAS};
+    cuda2hipRename["cublasDdotBatched"] = {"hipblasDdotBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotu"] = {"hipblasCdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotc"] = {"hipblasCdotc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotu"] = {"hipblasZdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotc"] = {"hipblasZdotc", CONV_MATH_FUNC, API_BLAS};
 
     // SCAL
-    cuda2hipRename["cublasSscal"] = {"hipblasSscal", CONV_BLAS};
+    cuda2hipRename["cublasSscal"] = {"hipblasSscal", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSscalBatched"] = {"hipblasSscalBatched", CONV_BLAS};
-    cuda2hipRename["cublasDscal"] = {"hipblasDscal", CONV_BLAS};
+    cuda2hipRename["cublasSscalBatched"] = {"hipblasSscalBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDscal"] = {"hipblasDscal", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDscalBatched"] = {"hipblasDscalBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCscal"]  = {"hipblasCscal", CONV_BLAS};
-    //cuda2hipRename["cublasCsscal"] = {"hipblasCsscal", CONV_BLAS};
-    //cuda2hipRename["cublasZscal"]  = {"hipblasZscal", CONV_BLAS};
-    //cuda2hipRename["cublasZdscal"] = {"hipblasZdscal", CONV_BLAS};
+    cuda2hipRename["cublasDscalBatched"] = {"hipblasDscalBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCscal"]  = {"hipblasCscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsscal"] = {"hipblasCsscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZscal"]  = {"hipblasZscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdscal"] = {"hipblasZdscal", CONV_MATH_FUNC, API_BLAS};
 
     // AXPY
-    cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_BLAS};
+    cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_BLAS};
+    cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS};
 
     // COPY
-    cuda2hipRename["cublasScopy"] = {"hipblasScopy", CONV_BLAS};
+    cuda2hipRename["cublasScopy"] = {"hipblasScopy", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasScopyBatched"] = {"hipblasScopyBatched", CONV_BLAS};
-    cuda2hipRename["cublasDcopy"] = {"hipblasDcopy", CONV_BLAS};
+    cuda2hipRename["cublasScopyBatched"] = {"hipblasScopyBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDcopy"] = {"hipblasDcopy", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDcopyBatched"] = {"hipblasDcopyBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCcopy"] = {"hipblasCcopy", CONV_BLAS};
-    //cuda2hipRename["cublasZcopy"] = {"hipblasZcopy", CONV_BLAS};
+    cuda2hipRename["cublasDcopyBatched"] = {"hipblasDcopyBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCcopy"] = {"hipblasCcopy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZcopy"] = {"hipblasZcopy", CONV_MATH_FUNC, API_BLAS};
 
     // SWAP
-    //cuda2hipRename["cublasSswap"] = {"hipblasSswap", CONV_BLAS};
-    //cuda2hipRename["cublasDswap"] = {"hipblasDswap", CONV_BLAS};
-    //cuda2hipRename["cublasCswap"] = {"hipblasCswap", CONV_BLAS};
-    //cuda2hipRename["cublasZswap"] = {"hipblasZswap", CONV_BLAS};
+    //cuda2hipRename["cublasSswap"] = {"hipblasSswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDswap"] = {"hipblasDswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCswap"] = {"hipblasCswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZswap"] = {"hipblasZswap", CONV_MATH_FUNC, API_BLAS};
 
     // AMAX
-    //cuda2hipRename["cublasIsamax"] = {"hipblasIsamax", CONV_BLAS};
-    //cuda2hipRename["cublasIdamax"] = {"hipblasIdamax", CONV_BLAS};
-    //cuda2hipRename["cublasIcamax"] = {"hipblasIcamax", CONV_BLAS};
-    //cuda2hipRename["cublasIzamax"] = {"hipblasIzamax", CONV_BLAS};
+    //cuda2hipRename["cublasIsamax"] = {"hipblasIsamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamax"] = {"hipblasIdamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamax"] = {"hipblasIcamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamax"] = {"hipblasIzamax", CONV_MATH_FUNC, API_BLAS};
 
     // AMIN
-    //cuda2hipRename["cublasIsamin"] = {"hipblasIsamin", CONV_BLAS};
-    //cuda2hipRename["cublasIdamin"] = {"hipblasIdamin", CONV_BLAS};
-    //cuda2hipRename["cublasIcamin"] = {"hipblasIcamin", CONV_BLAS};
-    //cuda2hipRename["cublasIzamin"] = {"hipblasIzamin", CONV_BLAS};
+    //cuda2hipRename["cublasIsamin"] = {"hipblasIsamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamin"] = {"hipblasIdamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamin"] = {"hipblasIcamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamin"] = {"hipblasIzamin", CONV_MATH_FUNC, API_BLAS};
 
     // ASUM
-    cuda2hipRename["cublasSasum"]        = {"hipblasSasum", CONV_BLAS};
+    cuda2hipRename["cublasSasum"]        = {"hipblasSasum", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSasumBatched"] = {"hipblasSasumBatched", CONV_BLAS};
-    cuda2hipRename["cublasDasum"]        = {"hipblasDasum", CONV_BLAS};
+    cuda2hipRename["cublasSasumBatched"] = {"hipblasSasumBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDasum"]        = {"hipblasDasum", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasDasumBatched"] = {"hipblasDasumBatched", CONV_BLAS};
-    //cuda2hipRename["cublasScasum"] = {"hipblasScasum", CONV_BLAS};
-    //cuda2hipRename["cublasDzasum"] = {"hipblasDzasum", CONV_BLAS};
+    cuda2hipRename["cublasDasumBatched"] = {"hipblasDasumBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScasum"] = {"hipblasScasum", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDzasum"] = {"hipblasDzasum", CONV_MATH_FUNC, API_BLAS};
 
     // ROT
-    //cuda2hipRename["cublasSrot"]  = {"hipblasSrot", CONV_BLAS};
-    //cuda2hipRename["cublasDrot"]  = {"hipblasDrot", CONV_BLAS};
-    //cuda2hipRename["cublasCrot"]  = {"hipblasCrot", CONV_BLAS};
-    //cuda2hipRename["cublasCsrot"] = {"hipblasCsrot", CONV_BLAS};
-    //cuda2hipRename["cublasZrot"]  = {"hipblasZrot", CONV_BLAS};
-    //cuda2hipRename["cublasZdrot"] = {"hipblasZdrot", CONV_BLAS};
+    //cuda2hipRename["cublasSrot"]  = {"hipblasSrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrot"]  = {"hipblasDrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrot"]  = {"hipblasCrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsrot"] = {"hipblasCsrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrot"]  = {"hipblasZrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdrot"] = {"hipblasZdrot", CONV_MATH_FUNC, API_BLAS};
 
     // ROTG
-    //cuda2hipRename["cublasSrotg"] = {"hipblasSrotg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotg"] = {"hipblasDrotg", CONV_BLAS};
-    //cuda2hipRename["cublasCrotg"] = {"hipblasCrotg", CONV_BLAS};
-    //cuda2hipRename["cublasZrotg"] = {"hipblasZrotg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotg"] = {"hipblasSrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotg"] = {"hipblasDrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrotg"] = {"hipblasCrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrotg"] = {"hipblasZrotg", CONV_MATH_FUNC, API_BLAS};
 
     // ROTM
-    //cuda2hipRename["cublasSrotm"] = {"hipblasSrotm", CONV_BLAS};
-    //cuda2hipRename["cublasDrotm"] = {"hipblasDrotm", CONV_BLAS};
+    //cuda2hipRename["cublasSrotm"] = {"hipblasSrotm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotm"] = {"hipblasDrotm", CONV_MATH_FUNC, API_BLAS};
 
     // ROTMG
-    //cuda2hipRename["cublasSrotmg"] = {"hipblasSrotmg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotmg"] = {"hipblasDrotmg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotmg"] = {"hipblasSrotmg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotmg"] = {"hipblasDrotmg", CONV_MATH_FUNC, API_BLAS};
 
     // GEMV
-    cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_BLAS};
+    cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
-    cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_BLAS};
-    //cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_BLAS};
-    //cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_BLAS};
+    cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS};
 
     // GBMV
-    //cuda2hipRename["cublasSgbmv"] = {"hipblasSgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDgbmv"] = {"hipblasDgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCgbmv"] = {"hipblasCgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZgbmv"] = {"hipblasZgbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSgbmv"] = {"hipblasSgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgbmv"] = {"hipblasDgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgbmv"] = {"hipblasCgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgbmv"] = {"hipblasZgbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRMV
-    //cuda2hipRename["cublasStrmv"] = {"hipblasStrmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmv"] = {"hipblasDtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmv"] = {"hipblasCtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmv"] = {"hipblasZtrmv", CONV_BLAS};
+    //cuda2hipRename["cublasStrmv"] = {"hipblasStrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmv"] = {"hipblasDtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmv"] = {"hipblasCtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmv"] = {"hipblasZtrmv", CONV_MATH_FUNC, API_BLAS};
 
     // TBMV
-    //cuda2hipRename["cublasStbmv"] = {"hipblasStbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbmv"] = {"hipblasDtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbmv"] = {"hipblasCtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbmv"] = {"hipblasZtbmv", CONV_BLAS};
+    //cuda2hipRename["cublasStbmv"] = {"hipblasStbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbmv"] = {"hipblasDtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbmv"] = {"hipblasCtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbmv"] = {"hipblasZtbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TPMV
-    //cuda2hipRename["cublasStpmv"] = {"hipblasStpmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpmv"] = {"hipblasDtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpmv"] = {"hipblasCtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpmv"] = {"hipblasZtpmv", CONV_BLAS};
+    //cuda2hipRename["cublasStpmv"] = {"hipblasStpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpmv"] = {"hipblasDtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpmv"] = {"hipblasCtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpmv"] = {"hipblasZtpmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRSV
-    //cuda2hipRename["cublasStrsv"] = {"hipblasStrsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsv"] = {"hipblasDtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsv"] = {"hipblasCtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsv"] = {"hipblasZtrsv", CONV_BLAS};
+    //cuda2hipRename["cublasStrsv"] = {"hipblasStrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsv"] = {"hipblasDtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsv"] = {"hipblasCtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsv"] = {"hipblasZtrsv", CONV_MATH_FUNC, API_BLAS};
 
     // TPSV
-    //cuda2hipRename["cublasStpsv"] = {"hipblasStpsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpsv"] = {"hipblasDtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpsv"] = {"hipblasCtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpsv"] = {"hipblasZtpsv", CONV_BLAS};
+    //cuda2hipRename["cublasStpsv"] = {"hipblasStpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpsv"] = {"hipblasDtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpsv"] = {"hipblasCtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpsv"] = {"hipblasZtpsv", CONV_MATH_FUNC, API_BLAS};
 
     // TBSV
-    //cuda2hipRename["cublasStbsv"] = {"hipblasStbsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbsv"] = {"hipblasDtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbsv"] = {"hipblasCtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbsv"] = {"hipblasZtbsv", CONV_BLAS};
+    //cuda2hipRename["cublasStbsv"] = {"hipblasStbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbsv"] = {"hipblasDtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbsv"] = {"hipblasCtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbsv"] = {"hipblasZtbsv", CONV_MATH_FUNC, API_BLAS};
 
     // SYMV/HEMV
-    //cuda2hipRename["cublasSsymv"] = {"hipblasSsymv", CONV_BLAS};
-    //cuda2hipRename["cublasDsymv"] = {"hipblasDsymv", CONV_BLAS};
-    //cuda2hipRename["cublasCsymv"] = {"hipblasCsymv", CONV_BLAS};
-    //cuda2hipRename["cublasZsymv"] = {"hipblasZsymv", CONV_BLAS};
-    //cuda2hipRename["cublasChemv"] = {"hipblasChemv", CONV_BLAS};
-    //cuda2hipRename["cublasZhemv"] = {"hipblasZhemv", CONV_BLAS};
+    //cuda2hipRename["cublasSsymv"] = {"hipblasSsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymv"] = {"hipblasDsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymv"] = {"hipblasCsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymv"] = {"hipblasZsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChemv"] = {"hipblasChemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemv"] = {"hipblasZhemv", CONV_MATH_FUNC, API_BLAS};
 
     // SBMV/HBMV
-    //cuda2hipRename["cublasSsbmv"] = {"hipblasSsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDsbmv"] = {"hpiblasDsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasChbmv"] = {"hipblasChbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhbmv"] = {"hipblasZhbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSsbmv"] = {"hipblasSsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsbmv"] = {"hpiblasDsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChbmv"] = {"hipblasChbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhbmv"] = {"hipblasZhbmv", CONV_MATH_FUNC, API_BLAS};
 
     // SPMV/HPMV
-    //cuda2hipRename["cublasSspmv"] = {"hipblasSspmv", CONV_BLAS};
-    //cuda2hipRename["cublasDspmv"] = {"hipblasDspmv", CONV_BLAS};
-    //cuda2hipRename["cublasChpmv"] = {"hipblasChpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhpmv"] = {"hipblasZhpmv", CONV_BLAS};
+    //cuda2hipRename["cublasSspmv"] = {"hipblasSspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspmv"] = {"hipblasDspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpmv"] = {"hipblasChpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpmv"] = {"hipblasZhpmv", CONV_MATH_FUNC, API_BLAS};
 
     // GER
-    cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_BLAS};
-    //cuda2hipRename["cublasDger"]  = {"hipblasDger", CONV_BLAS};
-    //cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_BLAS};
-    //cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_BLAS};
-    //cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_BLAS};
-    //cuda2hipRename["cublasZgerc"] = {"hipblasZgerc", CONV_BLAS};
+    cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDger"]  = {"hipblasDger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgerc"] = {"hipblasZgerc", CONV_MATH_FUNC, API_BLAS};
 
     // SYR/HER
-    //cuda2hipRename["cublasSsyr"] = {"hipblasSsyr", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr"] = {"hipblasDsyr", CONV_BLAS};
-    //cuda2hipRename["cublasCher"] = {"hipblasCher", CONV_BLAS};
-    //cuda2hipRename["cublasZher"] = {"hipblasZher", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr"] = {"hipblasSsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr"] = {"hipblasDsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher"] = {"hipblasCher", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher"] = {"hipblasZher", CONV_MATH_FUNC, API_BLAS};
 
     // SPR/HPR
-    //cuda2hipRename["cublasSspr"] = {"hipblasSspr", CONV_BLAS};
-    //cuda2hipRename["cublasDspr"] = {"hipblasDspr", CONV_BLAS};
-    //cuda2hipRename["cublasChpr"] = {"hipblasChpr", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr"] = {"hipblasZhpr", CONV_BLAS};
+    //cuda2hipRename["cublasSspr"] = {"hipblasSspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr"] = {"hipblasDspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr"] = {"hipblasChpr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr"] = {"hipblasZhpr", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2/HER2
-    //cuda2hipRename["cublasSsyr2"] = {"hipblasSsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2"] = {"hipblasDsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasCher2"] = {"hipblasCher2", CONV_BLAS};
-    //cuda2hipRename["cublasZher2"] = {"hipblasZher2", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2"] = {"hipblasSsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2"] = {"hipblasDsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher2"] = {"hipblasCher2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2"] = {"hipblasZher2", CONV_MATH_FUNC, API_BLAS};
 
     // SPR2/HPR2
-    //cuda2hipRename["cublasSspr2"] = {"hipblasSspr2", CONV_BLAS};
-    //cuda2hipRename["cublasDspr2"] = {"hipblasDspr2", CONV_BLAS};
-    //cuda2hipRename["cublasChpr2"] = {"hipblasChpr2", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr2"] = {"hipblasZhpr2", CONV_BLAS};
+    //cuda2hipRename["cublasSspr2"] = {"hipblasSspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr2"] = {"hipblasDspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr2"] = {"hipblasChpr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr2"] = {"hipblasZhpr2", CONV_MATH_FUNC, API_BLAS};
 
     // Blas3 (v1) Routines
     // GEMM
-    cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_BLAS};
-    //cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_BLAS};
-    cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_BLAS};
-    //cuda2hipRename["cublasZgemm"] = {"hipblasZgemm", CONV_BLAS};
+    cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemm"] = {"hipblasZgemm", CONV_MATH_FUNC, API_BLAS};
 
     // BATCH GEMM
-    cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_BLAS};
-    cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasZgemmBatched"] = {"hipblasZgemmBatched", CONV_BLAS};
+    cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemmBatched"] = {"hipblasZgemmBatched", CONV_MATH_FUNC, API_BLAS};
 
     // SYRK
-    //cuda2hipRename["cublasSsyrk"] = {"hipblasSsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasDsyrk"] = {"hipblasDsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasCsyrk"] = {"hipblasCsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasZsyrk"] = {"hipblasZsyrk", CONV_BLAS};
+    //cuda2hipRename["cublasSsyrk"] = {"hipblasSsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyrk"] = {"hipblasDsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyrk"] = {"hipblasCsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyrk"] = {"hipblasZsyrk", CONV_MATH_FUNC, API_BLAS};
 
     // HERK
-    //cuda2hipRename["cublasCherk"] = {"hipblasCherk", CONV_BLAS};
-    //cuda2hipRename["cublasZherk"] = {"hipblasZherk", CONV_BLAS};
+    //cuda2hipRename["cublasCherk"] = {"hipblasCherk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZherk"] = {"hipblasZherk", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2K
-    //cuda2hipRename["cublasSsyr2k"] = {"hipblasSsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2k"] = {"hipblasDsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr2k"] = {"hipblasCsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr2k"] = {"hipblasZsyr2k", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2k"] = {"hipblasSsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2k"] = {"hipblasDsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr2k"] = {"hipblasCsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr2k"] = {"hipblasZsyr2k", CONV_MATH_FUNC, API_BLAS};
 
     // SYRKX - eXtended SYRK
     // cublasSsyrkx
@@ -652,40 +663,40 @@ struct cuda2hipMap {
     // cublasZsyrkx
 
     // HER2K
-    //cuda2hipRename["cublasCher2k"] = {"hipblasCher2k", CONV_BLAS};
-    //cuda2hipRename["cublasZher2k"] = {"hipblasZher2k", CONV_BLAS};
+    //cuda2hipRename["cublasCher2k"] = {"hipblasCher2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2k"] = {"hipblasZher2k", CONV_MATH_FUNC, API_BLAS};
 
     // HERKX - eXtended HERK
     // cublasCherkx
     // cublasZherkx
 
     // SYMM
-    //cuda2hipRename["cublasSsymm"] = {"hipblasSsymm", CONV_BLAS};
-    //cuda2hipRename["cublasDsymm"] = {"hipblasDsymm", CONV_BLAS};
-    //cuda2hipRename["cublasCsymm"] = {"hipblasCsymm", CONV_BLAS};
-    //cuda2hipRename["cublasZsymm"] = {"hipblasZsymm", CONV_BLAS};
+    //cuda2hipRename["cublasSsymm"] = {"hipblasSsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymm"] = {"hipblasDsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymm"] = {"hipblasCsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymm"] = {"hipblasZsymm", CONV_MATH_FUNC, API_BLAS};
 
     // HEMM
-    //cuda2hipRename["cublasChemm"] = {"hipblasChemm", CONV_BLAS};
-    //cuda2hipRename["cublasZhemm"] = {"hipblasZhemm", CONV_BLAS};
+    //cuda2hipRename["cublasChemm"] = {"hipblasChemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemm"] = {"hipblasZhemm", CONV_MATH_FUNC, API_BLAS};
 
     // TRSM
-    //cuda2hipRename["cublasStrsm"] = {"hipblasStrsm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsm"] = {"hipblasDtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsm"] = {"hipblasCtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsm"] = {"hipblasZtrsm", CONV_BLAS};
+    //cuda2hipRename["cublasStrsm"] = {"hipblasStrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsm"] = {"hipblasDtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsm"] = {"hipblasCtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsm"] = {"hipblasZtrsm", CONV_MATH_FUNC, API_BLAS};
 
     // TRSM - Batched Triangular Solver
-    //cuda2hipRename["cublasStrsmBatched"] = {"hipblasStrsmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsmBatched"] = {"hipblasDtrsmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsmBatched"] = {"hipblasCtrsmBatched", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsmBatched"] = {"hipblasZtrsmBatched", CONV_BLAS};
+    //cuda2hipRename["cublasStrsmBatched"] = {"hipblasStrsmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsmBatched"] = {"hipblasDtrsmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsmBatched"] = {"hipblasCtrsmBatched", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsmBatched"] = {"hipblasZtrsmBatched", CONV_MATH_FUNC, API_BLAS};
 
     // TRMM
-    //cuda2hipRename["cublasStrmm"] = {"hipblasStrmm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmm"] = {"hipblasDtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmm"] = {"hipblasCtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmm"] = {"hipblasZtrmm", CONV_BLAS};
+    //cuda2hipRename["cublasStrmm"] = {"hipblasStrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmm"] = {"hipblasDtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmm"] = {"hipblasCtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmm"] = {"hipblasZtrmm", CONV_MATH_FUNC, API_BLAS};
 
 
     // TO SUPPORT OR NOT? (cublas_api.h)
@@ -758,256 +769,257 @@ struct cuda2hipMap {
     // cublasZtrttp
 
     // Blas2 (v2) Routines
-    cuda2hipRename["cublasCreate_v2"] =  {"hipblasCreate", CONV_BLAS};
-    cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_BLAS};
+    cuda2hipRename["cublasCreate_v2"] =  {"hipblasCreate", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS};
 
     // unsupported yet by hipblas/hcblas
-    //cuda2hipRename["cublasGetVersion_v2"]     = {"hipblasGetVersion", CONV_BLAS};
-    //cuda2hipRename["cublasSetStream_v2"]      = {"hipblasSetStream", CONV_BLAS};
-    //cuda2hipRename["cublasGetStream_v2"]      = {"hipblasGetStream", CONV_BLAS};
-    //cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_BLAS};
-    //cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_BLAS};
+    //cuda2hipRename["cublasGetVersion_v2"]     = {"hipblasGetVersion", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasSetStream_v2"]      = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasGetStream_v2"]      = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS};
 
     // GEMV
-    cuda2hipRename["cublasSgemv_v2"] = {"hipblasSgemv", CONV_BLAS};
-    //cuda2hipRename["cublasDgemv_v2"] = {"hipblasDgemv", CONV_BLAS};
-    //cuda2hipRename["cublasCgemv_v2"] = {"hipblasCgemv", CONV_BLAS};
-    //cuda2hipRename["cublasZgemv_v2"] = {"hipblasZgemv", CONV_BLAS};
+    cuda2hipRename["cublasSgemv_v2"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemv_v2"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgemv_v2"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemv_v2"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS};
 
     // GBMV
-    //cuda2hipRename["cublasSgbmv_v2"] = {"hipblasSgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDgbmv_v2"] = {"hipblasDgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCgbmv_v2"] = {"hipblasCgbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZgbmv_v2"] = {"hipblasZgbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSgbmv_v2"] = {"hipblasSgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgbmv_v2"] = {"hipblasDgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgbmv_v2"] = {"hipblasCgbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgbmv_v2"] = {"hipblasZgbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRMV
-    //cuda2hipRename["cublasStrmv_v2"] = {"hipblasStrmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmv_v2"] = {"hipblasDtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmv_v2"] = {"hipblasCtrmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmv_v2"] = {"hipblasZtrmv", CONV_BLAS};
+    //cuda2hipRename["cublasStrmv_v2"] = {"hipblasStrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmv_v2"] = {"hipblasDtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmv_v2"] = {"hipblasCtrmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmv_v2"] = {"hipblasZtrmv", CONV_MATH_FUNC, API_BLAS};
 
     // TBMV
-    //cuda2hipRename["cublasStbmv_v2"] = {"hipblasStbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbmv_v2"] = {"hipblasDtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbmv_v2"] = {"hipblasCtbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbmv_v2"] = {"hipblasZtbmv", CONV_BLAS};
+    //cuda2hipRename["cublasStbmv_v2"] = {"hipblasStbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbmv_v2"] = {"hipblasDtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbmv_v2"] = {"hipblasCtbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbmv_v2"] = {"hipblasZtbmv", CONV_MATH_FUNC, API_BLAS};
 
     // TPMV
-    //cuda2hipRename["cublasStpmv_v2"] = {"hipblasStpmv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpmv_v2"] = {"hipblasDtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpmv_v2"] = {"hipblasCtpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpmv_v2"] = {"hipblasZtpmv", CONV_BLAS};
+    //cuda2hipRename["cublasStpmv_v2"] = {"hipblasStpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpmv_v2"] = {"hipblasDtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpmv_v2"] = {"hipblasCtpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpmv_v2"] = {"hipblasZtpmv", CONV_MATH_FUNC, API_BLAS};
 
     // TRSV
-    //cuda2hipRename["cublasStrsv_v2"] = {"hipblasStrsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsv_v2"] = {"hipblasDtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsv_v2"] = {"hipblasCtrsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsv_v2"] = {"hipblasZtrsv", CONV_BLAS};
+    //cuda2hipRename["cublasStrsv_v2"] = {"hipblasStrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsv_v2"] = {"hipblasDtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsv_v2"] = {"hipblasCtrsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsv_v2"] = {"hipblasZtrsv", CONV_MATH_FUNC, API_BLAS};
 
     // TPSV
-    //cuda2hipRename["cublasStpsv_v2"] = {"hipblasStpsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtpsv_v2"] = {"hipblasDtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtpsv_v2"] = {"hipblasCtpsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtpsv_v2"] = {"hipblasZtpsv", CONV_BLAS};
+    //cuda2hipRename["cublasStpsv_v2"] = {"hipblasStpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtpsv_v2"] = {"hipblasDtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtpsv_v2"] = {"hipblasCtpsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtpsv_v2"] = {"hipblasZtpsv", CONV_MATH_FUNC, API_BLAS};
 
     // TBSV
-    //cuda2hipRename["cublasStbsv_v2"] = {"hipblasStbsv", CONV_BLAS};
-    //cuda2hipRename["cublasDtbsv_v2"] = {"hipblasDtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasCtbsv_v2"] = {"hipblasCtbsv", CONV_BLAS};
-    //cuda2hipRename["cublasZtbsv_v2"] = {"hipblasZtbsv", CONV_BLAS};
+    //cuda2hipRename["cublasStbsv_v2"] = {"hipblasStbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtbsv_v2"] = {"hipblasDtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtbsv_v2"] = {"hipblasCtbsv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtbsv_v2"] = {"hipblasZtbsv", CONV_MATH_FUNC, API_BLAS};
 
     // SYMV/HEMV
-    //cuda2hipRename["cublasSsymv_v2"] = {"hipblasSsymv", CONV_BLAS};
-    //cuda2hipRename["cublasDsymv_v2"] = {"hipblasDsymv", CONV_BLAS};
-    //cuda2hipRename["cublasCsymv_v2"] = {"hipblasCsymv", CONV_BLAS};
-    //cuda2hipRename["cublasZsymv_v2"] = {"hipblasZsymv", CONV_BLAS};
-    //cuda2hipRename["cublasChemv_v2"] = {"hipblasChemv", CONV_BLAS};
-    //cuda2hipRename["cublasZhemv_v2"] = {"hipblasZhemv", CONV_BLAS};
+    //cuda2hipRename["cublasSsymv_v2"] = {"hipblasSsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymv_v2"] = {"hipblasDsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymv_v2"] = {"hipblasCsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymv_v2"] = {"hipblasZsymv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChemv_v2"] = {"hipblasChemv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemv_v2"] = {"hipblasZhemv", CONV_MATH_FUNC, API_BLAS};
 
     // SBMV/HBMV
-    //cuda2hipRename["cublasSsbmv_v2"] = {"hipblasSsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasDsbmv_v2"] = {"hpiblasDsbmv", CONV_BLAS};
-    //cuda2hipRename["cublasChbmv_v2"] = {"hipblasChbmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhbmv_v2"] = {"hipblasZhbmv", CONV_BLAS};
+    //cuda2hipRename["cublasSsbmv_v2"] = {"hipblasSsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsbmv_v2"] = {"hpiblasDsbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChbmv_v2"] = {"hipblasChbmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhbmv_v2"] = {"hipblasZhbmv", CONV_MATH_FUNC, API_BLAS};
 
     // SPMV/HPMV
-    //cuda2hipRename["cublasSspmv_v2"] = {"hipblasSspmv", CONV_BLAS};
-    //cuda2hipRename["cublasDspmv_v2"] = {"hipblasDspmv", CONV_BLAS};
-    //cuda2hipRename["cublasChpmv_v2"] = {"hipblasChpmv", CONV_BLAS};
-    //cuda2hipRename["cublasZhpmv_v2"] = {"hipblasZhpmv", CONV_BLAS};
+    //cuda2hipRename["cublasSspmv_v2"] = {"hipblasSspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspmv_v2"] = {"hipblasDspmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpmv_v2"] = {"hipblasChpmv", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpmv_v2"] = {"hipblasZhpmv", CONV_MATH_FUNC, API_BLAS};
 
     // GER
-    cuda2hipRename["cublasSger_v2"]  = {"hipblasSger", CONV_BLAS};
-    //cuda2hipRename["cublasDger_v2"]  = {"hipblasDger", CONV_BLAS};
-    //cuda2hipRename["cublasCgeru_v2"] = {"hipblasCgeru", CONV_BLAS};
-    //cuda2hipRename["cublasCgerc_v2"] = {"hipblasCgerc", CONV_BLAS};
-    //cuda2hipRename["cublasZgeru_v2"] = {"hipblasZgeru", CONV_BLAS};
-    //cuda2hipRename["cublasZgerc_v2"] = {"hipblasZgerc", CONV_BLAS};
+    cuda2hipRename["cublasSger_v2"]  = {"hipblasSger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDger_v2"]  = {"hipblasDger", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgeru_v2"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCgerc_v2"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgeru_v2"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgerc_v2"] = {"hipblasZgerc", CONV_MATH_FUNC, API_BLAS};
 
     // SYR/HER
-    //cuda2hipRename["cublasSsyr_v2"] = {"hipblasSsyr", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr_v2"] = {"hipblasDsyr", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr_v2"] = {"hipblasCsyr", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr_v2"] = {"hipblasZsyr", CONV_BLAS};
-    //cuda2hipRename["cublasCher_v2"] = {"hipblasCher", CONV_BLAS};
-    //cuda2hipRename["cublasZher_v2"] = {"hipblasZher", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr_v2"] = {"hipblasSsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr_v2"] = {"hipblasDsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr_v2"] = {"hipblasCsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr_v2"] = {"hipblasZsyr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher_v2"] = {"hipblasCher", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher_v2"] = {"hipblasZher", CONV_MATH_FUNC, API_BLAS};
 
     // SPR/HPR
-    //cuda2hipRename["cublasSspr_v2"] = {"hipblasSspr", CONV_BLAS};
-    //cuda2hipRename["cublasDspr_v2"] = {"hipblasDspr", CONV_BLAS};
-    //cuda2hipRename["cublasChpr_v2"] = {"hipblasChpr", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr_v2"] = {"hipblasZhpr", CONV_BLAS};
+    //cuda2hipRename["cublasSspr_v2"] = {"hipblasSspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr_v2"] = {"hipblasDspr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr_v2"] = {"hipblasChpr", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr_v2"] = {"hipblasZhpr", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2/HER2
-    //cuda2hipRename["cublasSsyr2_v2"] = {"hipblasSsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2_v2"] = {"hipblasDsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr2_v2"] = {"hipblasCsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr2_v2"] = {"hipblasZsyr2", CONV_BLAS};
-    //cuda2hipRename["cublasCher2_v2"] = {"hipblasCher2", CONV_BLAS};
-    //cuda2hipRename["cublasZher2_v2"] = {"hipblasZher2", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2_v2"] = {"hipblasSsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2_v2"] = {"hipblasDsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr2_v2"] = {"hipblasCsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr2_v2"] = {"hipblasZsyr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCher2_v2"] = {"hipblasCher2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2_v2"] = {"hipblasZher2", CONV_MATH_FUNC, API_BLAS};
 
     // SPR2/HPR2
-    //cuda2hipRename["cublasSspr2_v2"] = {"hipblasSspr2", CONV_BLAS};
-    //cuda2hipRename["cublasDspr2_v2"] = {"hipblasDspr2", CONV_BLAS};
-    //cuda2hipRename["cublasChpr2_v2"] = {"hipblasChpr2", CONV_BLAS};
-    //cuda2hipRename["cublasZhpr2_v2"] = {"hipblasZhpr2", CONV_BLAS};
+    //cuda2hipRename["cublasSspr2_v2"] = {"hipblasSspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDspr2_v2"] = {"hipblasDspr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasChpr2_v2"] = {"hipblasChpr2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhpr2_v2"] = {"hipblasZhpr2", CONV_MATH_FUNC, API_BLAS};
 
     // Blas3 (v2) Routines
     // GEMM
-    cuda2hipRename["cublasSgemm_v2"] = {"hipblasSgemm", CONV_BLAS};
-    //cuda2hipRename["cublasDgemm_v2"] = {"hipblasDgemm", CONV_BLAS};
-    cuda2hipRename["cublasCgemm_v2"] = {"hipblasCgemm", CONV_BLAS};
-    //cuda2hipRename["cublasZgemm_v2"] = {"hipblasZgemm", CONV_BLAS};
+    cuda2hipRename["cublasSgemm_v2"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDgemm_v2"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasCgemm_v2"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZgemm_v2"] = {"hipblasZgemm", CONV_MATH_FUNC, API_BLAS};
 
     //IO in FP16 / FP32, computation in float
     // cublasSgemmEx
 
     // SYRK
-    //cuda2hipRename["cublasSsyrk_v2"] = {"hipblasSsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasDsyrk_v2"] = {"hipblasDsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasCsyrk_v2"] = {"hipblasCsyrk", CONV_BLAS};
-    //cuda2hipRename["cublasZsyrk_v2"] = {"hipblasZsyrk", CONV_BLAS};
+    //cuda2hipRename["cublasSsyrk_v2"] = {"hipblasSsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyrk_v2"] = {"hipblasDsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyrk_v2"] = {"hipblasCsyrk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyrk_v2"] = {"hipblasZsyrk", CONV_MATH_FUNC, API_BLAS};
 
     // HERK
-    //cuda2hipRename["cublasCherk_v2"] = {"hipblasCherk", CONV_BLAS};
-    //cuda2hipRename["cublasZherk_v2"] = {"hipblasZherk", CONV_BLAS};
+    //cuda2hipRename["cublasCherk_v2"] = {"hipblasCherk", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZherk_v2"] = {"hipblasZherk", CONV_MATH_FUNC, API_BLAS};
 
     // SYR2K
-    //cuda2hipRename["cublasSsyr2k_v2"] = {"hipblasSsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasDsyr2k_v2"] = {"hipblasDsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasCsyr2k_v2"] = {"hipblasCsyr2k", CONV_BLAS};
-    //cuda2hipRename["cublasZsyr2k_v2"] = {"hipblasZsyr2k", CONV_BLAS};
+    //cuda2hipRename["cublasSsyr2k_v2"] = {"hipblasSsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsyr2k_v2"] = {"hipblasDsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsyr2k_v2"] = {"hipblasCsyr2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsyr2k_v2"] = {"hipblasZsyr2k", CONV_MATH_FUNC, API_BLAS};
 
     // HER2K
-    //cuda2hipRename["cublasCher2k_v2"] = {"hipblasCher2k", CONV_BLAS};
-    //cuda2hipRename["cublasZher2k_v2"] = {"hipblasZher2k", CONV_BLAS};
+    //cuda2hipRename["cublasCher2k_v2"] = {"hipblasCher2k", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZher2k_v2"] = {"hipblasZher2k", CONV_MATH_FUNC, API_BLAS};
 
     // SYMM
-    //cuda2hipRename["cublasSsymm_v2"] = {"hipblasSsymm", CONV_BLAS};
-    //cuda2hipRename["cublasDsymm_v2"] = {"hipblasDsymm", CONV_BLAS};
-    //cuda2hipRename["cublasCsymm_v2"] = {"hipblasCsymm", CONV_BLAS};
-    //cuda2hipRename["cublasZsymm_v2"] = {"hipblasZsymm", CONV_BLAS};
+    //cuda2hipRename["cublasSsymm_v2"] = {"hipblasSsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDsymm_v2"] = {"hipblasDsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsymm_v2"] = {"hipblasCsymm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZsymm_v2"] = {"hipblasZsymm", CONV_MATH_FUNC, API_BLAS};
 
     // HEMM
-    //cuda2hipRename["cublasChemm_v2"] = {"hipblasChemm", CONV_BLAS};
-    //cuda2hipRename["cublasZhemm_v2"] = {"hipblasZhemm", CONV_BLAS};
+    //cuda2hipRename["cublasChemm_v2"] = {"hipblasChemm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZhemm_v2"] = {"hipblasZhemm", CONV_MATH_FUNC, API_BLAS};
 
     // TRSM
-    //cuda2hipRename["cublasStrsm_v2"] = {"hipblasStrsm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrsm_v2"] = {"hipblasDtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrsm_v2"] = {"hipblasCtrsm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrsm_v2"] = {"hipblasZtrsm", CONV_BLAS};
+    //cuda2hipRename["cublasStrsm_v2"] = {"hipblasStrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrsm_v2"] = {"hipblasDtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrsm_v2"] = {"hipblasCtrsm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrsm_v2"] = {"hipblasZtrsm", CONV_MATH_FUNC, API_BLAS};
 
     // TRMM
-    //cuda2hipRename["cublasStrmm_v2"] = {"hipblasStrmm", CONV_BLAS};
-    //cuda2hipRename["cublasDtrmm_v2"] = {"hipblasDtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasCtrmm_v2"] = {"hipblasCtrmm", CONV_BLAS};
-    //cuda2hipRename["cublasZtrmm_v2"] = {"hipblasZtrmm", CONV_BLAS};
+    //cuda2hipRename["cublasStrmm_v2"] = {"hipblasStrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDtrmm_v2"] = {"hipblasDtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCtrmm_v2"] = {"hipblasCtrmm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZtrmm_v2"] = {"hipblasZtrmm", CONV_MATH_FUNC, API_BLAS};
 
     // NRM2
-    //cuda2hipRename["cublasSnrm2_v2"]  = {"hipblasSnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDnrm2_v2"]  = {"hipblasDnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasScnrm2_v2"] = {"hipblasScnrm2", CONV_BLAS};
-    //cuda2hipRename["cublasDznrm2_v2"] = {"hipblasDznrm2", CONV_BLAS};
+    //cuda2hipRename["cublasSnrm2_v2"]  = {"hipblasSnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDnrm2_v2"]  = {"hipblasDnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScnrm2_v2"] = {"hipblasScnrm2", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDznrm2_v2"] = {"hipblasDznrm2", CONV_MATH_FUNC, API_BLAS};
 
     // DOT
-    cuda2hipRename["cublasSdot_v2"]  = {"hipblasSdot", CONV_BLAS};
-    cuda2hipRename["cublasDdot_v2"]  = {"hipblasDdot", CONV_BLAS};
-    //cuda2hipRename["cublasCdotu_v2"] = {"hipblasCdotu", CONV_BLAS};
-    //cuda2hipRename["cublasCdotc_v2"] = {"hipblasCdotc", CONV_BLAS};
-    //cuda2hipRename["cublasZdotu_v2"] = {"hipblasZdotu", CONV_BLAS};
-    //cuda2hipRename["cublasZdotc_v2"] = {"hipblasZdotc", CONV_BLAS};
+    cuda2hipRename["cublasSdot_v2"]  = {"hipblasSdot", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDdot_v2"]  = {"hipblasDdot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotu_v2"] = {"hipblasCdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCdotc_v2"] = {"hipblasCdotc", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotu_v2"] = {"hipblasZdotu", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdotc_v2"] = {"hipblasZdotc", CONV_MATH_FUNC, API_BLAS};
 
     // SCAL
-    cuda2hipRename["cublasSscal_v2"]  = {"hipblasSscal", CONV_BLAS};
-    cuda2hipRename["cublasDscal_v2"]  = {"hipblasDscal", CONV_BLAS};
-    //cuda2hipRename["cublasCscal_v2"]  = {"hipblasCscal", CONV_BLAS};
-    //cuda2hipRename["cublasCsscal_v2"] = {"hipblasCsscal", CONV_BLAS};
-    //cuda2hipRename["cublasZscal_v2"]  = {"hipblasZscal", CONV_BLAS};
-    //cuda2hipRename["cublasZdscal_v2"] = {"hipblasZdscal", CONV_BLAS};
+    cuda2hipRename["cublasSscal_v2"]  = {"hipblasSscal", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDscal_v2"]  = {"hipblasDscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCscal_v2"]  = {"hipblasCscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsscal_v2"] = {"hipblasCsscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZscal_v2"]  = {"hipblasZscal", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdscal_v2"] = {"hipblasZdscal", CONV_MATH_FUNC, API_BLAS};
 
     // AXPY
-    cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_BLAS};
-    //cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_BLAS};
+    cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS};
 
     // COPY
-    cuda2hipRename["cublasScopy_v2"] = {"hipblasScopy", CONV_BLAS};
-    cuda2hipRename["cublasDcopy_v2"] = {"hipblasDcopy", CONV_BLAS};
-    //cuda2hipRename["cublasCcopy_v2"] = {"hipblasCcopy", CONV_BLAS};
-    //cuda2hipRename["cublasZcopy_v2"] = {"hipblasZcopy", CONV_BLAS};
+    cuda2hipRename["cublasScopy_v2"] = {"hipblasScopy", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDcopy_v2"] = {"hipblasDcopy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCcopy_v2"] = {"hipblasCcopy", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZcopy_v2"] = {"hipblasZcopy", CONV_MATH_FUNC, API_BLAS};
 
     // SWAP
-    //cuda2hipRename["cublasSswap_v2"] = {"hipblasSswap", CONV_BLAS};
-    //cuda2hipRename["cublasDswap_v2"] = {"hipblasDswap", CONV_BLAS};
-    //cuda2hipRename["cublasCswap_v2"] = {"hipblasCswap", CONV_BLAS};
-    //cuda2hipRename["cublasZswap_v2"] = {"hipblasZswap", CONV_BLAS};
+    //cuda2hipRename["cublasSswap_v2"] = {"hipblasSswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDswap_v2"] = {"hipblasDswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCswap_v2"] = {"hipblasCswap", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZswap_v2"] = {"hipblasZswap", CONV_MATH_FUNC, API_BLAS};
 
     // AMAX
-    //cuda2hipRename["cublasIsamax_v2"] = {"hipblasIsamax", CONV_BLAS};
-    //cuda2hipRename["cublasIdamax_v2"] = {"hipblasIdamax", CONV_BLAS};
-    //cuda2hipRename["cublasIcamax_v2"] = {"hipblasIcamax", CONV_BLAS};
-    //cuda2hipRename["cublasIzamax_v2"] = {"hipblasIzamax", CONV_BLAS};
+    //cuda2hipRename["cublasIsamax_v2"] = {"hipblasIsamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamax_v2"] = {"hipblasIdamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamax_v2"] = {"hipblasIcamax", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamax_v2"] = {"hipblasIzamax", CONV_MATH_FUNC, API_BLAS};
 
     // AMIN
-    //cuda2hipRename["cublasIsamin_v2"] = {"hipblasIsamin", CONV_BLAS};
-    //cuda2hipRename["cublasIdamin_v2"] = {"hipblasIdamin", CONV_BLAS};
-    //cuda2hipRename["cublasIcamin_v2"] = {"hipblasIcamin", CONV_BLAS};
-    //cuda2hipRename["cublasIzamin_v2"] = {"hipblasIzamin", CONV_BLAS};
+    //cuda2hipRename["cublasIsamin_v2"] = {"hipblasIsamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIdamin_v2"] = {"hipblasIdamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIcamin_v2"] = {"hipblasIcamin", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasIzamin_v2"] = {"hipblasIzamin", CONV_MATH_FUNC, API_BLAS};
 
     // ASUM
-    cuda2hipRename["cublasSasum_v2"]  = {"hipblasSasum", CONV_BLAS};
-    cuda2hipRename["cublasDasum_v2"]  = {"hipblasDasum", CONV_BLAS};
-    //cuda2hipRename["cublasScasum_v2"] = {"hipblasScasum", CONV_BLAS};
-    //cuda2hipRename["cublasDzasum_v2"] = {"hipblasDzasum", CONV_BLAS};
+    cuda2hipRename["cublasSasum_v2"]  = {"hipblasSasum", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDasum_v2"]  = {"hipblasDasum", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasScasum_v2"] = {"hipblasScasum", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDzasum_v2"] = {"hipblasDzasum", CONV_MATH_FUNC, API_BLAS};
 
     // ROT
-    //cuda2hipRename["cublasSrot_v2"]  = {"hipblasSrot", CONV_BLAS};
-    //cuda2hipRename["cublasDrot_v2"]  = {"hipblasDrot", CONV_BLAS};
-    //cuda2hipRename["cublasCrot_v2"]  = {"hipblasCrot", CONV_BLAS};
-    //cuda2hipRename["cublasCsrot_v2"] = {"hipblasCsrot", CONV_BLAS};
-    //cuda2hipRename["cublasZrot_v2"]  = {"hipblasZrot", CONV_BLAS};
-    //cuda2hipRename["cublasZdrot_v2"] = {"hipblasZdrot", CONV_BLAS};
+    //cuda2hipRename["cublasSrot_v2"]  = {"hipblasSrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrot_v2"]  = {"hipblasDrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrot_v2"]  = {"hipblasCrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCsrot_v2"] = {"hipblasCsrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrot_v2"]  = {"hipblasZrot", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZdrot_v2"] = {"hipblasZdrot", CONV_MATH_FUNC, API_BLAS};
 
     // ROTG
-    //cuda2hipRename["cublasSrotg_v2"] = {"hipblasSrotg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotg_v2"] = {"hipblasDrotg", CONV_BLAS};
-    //cuda2hipRename["cublasCrotg_v2"] = {"hipblasCrotg", CONV_BLAS};
-    //cuda2hipRename["cublasZrotg_v2"] = {"hipblasZrotg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotg_v2"] = {"hipblasSrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotg_v2"] = {"hipblasDrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasCrotg_v2"] = {"hipblasCrotg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasZrotg_v2"] = {"hipblasZrotg", CONV_MATH_FUNC, API_BLAS};
 
     // ROTM
-    //cuda2hipRename["cublasSrotm_v2"] = {"hipblasSrotm", CONV_BLAS};
-    //cuda2hipRename["cublasDrotm_v2"] = {"hipblasDrotm", CONV_BLAS};
+    //cuda2hipRename["cublasSrotm_v2"] = {"hipblasSrotm", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotm_v2"] = {"hipblasDrotm", CONV_MATH_FUNC, API_BLAS};
 
     // ROTMG
-    //cuda2hipRename["cublasSrotmg_v2"] = {"hipblasSrotmg", CONV_BLAS};
-    //cuda2hipRename["cublasDrotmg_v2"] = {"hipblasDrotmg", CONV_BLAS};
+    //cuda2hipRename["cublasSrotmg_v2"] = {"hipblasSrotmg", CONV_MATH_FUNC, API_BLAS};
+    //cuda2hipRename["cublasDrotmg_v2"] = {"hipblasDrotmg", CONV_MATH_FUNC, API_BLAS};
   }
 
   struct HipNames {
     StringRef hipName;
     ConvTypes countType;
+    ApiTypes countApiType;
   };
 
   SmallDenseMap<StringRef, HipNames> cuda2hipRename;
@@ -1023,7 +1035,8 @@ StringRef unquoteStr(StringRef s) {
 static void processString(StringRef s, const cuda2hipMap &map,
                           Replacements *Replace, SourceManager &SM,
                           SourceLocation start,
-                          int64_t countReps[ConvTypes::CONV_LAST]) {
+                          int64_t countReps[CONV_LAST],
+                          int64_t countApiReps[API_LAST]) {
   size_t begin = 0;
   while ((begin = s.find("cuda", begin)) != StringRef::npos ||
          (begin = s.find("cublas", begin)) != StringRef::npos) {
@@ -1031,8 +1044,9 @@ static void processString(StringRef s, const cuda2hipMap &map,
     StringRef name = s.slice(begin, end);
     const auto found = map.cuda2hipRename.find(name);
     if (found != map.cuda2hipRename.end()) {
-      countReps[CONV_LITERAL]++;
       StringRef repName = found->second.hipName;
+      countReps[CONV_LITERAL]++;
+      countApiReps[API_RUNTIME]++;
       SourceLocation sl = start.getLocWithOffset(begin + 1);
       Replacement Rep(SM, sl, name.size(), repName);
       Replace->insert(Rep);
@@ -1074,8 +1088,9 @@ public:
       if (is_angled) {
         const auto found = N.cuda2hipRename.find(file_name);
         if (found != N.cuda2hipRename.end()) {
-          countReps[found->second.countType]++;
           StringRef repName = found->second.hipName;
+          countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
           DEBUG(dbgs() << "Include file found: " << file_name << "\n"
                        << "SourceLocation:"
                        << filename_range.getBegin().printToString(*_sm) << "\n"
@@ -1102,8 +1117,9 @@ public:
           StringRef name = T.getIdentifierInfo()->getName();
           const auto found = N.cuda2hipRename.find(name);
           if (found != N.cuda2hipRename.end()) {
-            countReps[found->second.countType]++;
             StringRef repName = found->second.hipName;
+            countReps[found->second.countType]++;
+            countApiReps[found->second.countApiType]++;
             SourceLocation sl = T.getLocation();
             DEBUG(dbgs() << "Identifier " << name
                          << " found in definition of macro "
@@ -1148,8 +1164,9 @@ public:
               StringRef name = tok.getIdentifierInfo()->getName();
               const auto found = N.cuda2hipRename.find(name);
               if (found != N.cuda2hipRename.end()) {
-                countReps[found->second.countType]++;
                 StringRef repName = found->second.hipName;
+                countReps[found->second.countType]++;
+                countApiReps[found->second.countApiType]++;
                 DEBUG(dbgs()
                       << "Identifier " << name
                       << " found as an actual argument in expansion of macro "
@@ -1170,8 +1187,9 @@ public:
                 const auto found = N.cuda2hipRename.find(name);
                 if (found != N.cuda2hipRename.end()) {
                   sl = sl_macro;
-                  countReps[found->second.countType]++;
                   StringRef repName = found->second.hipName;
+                  countReps[found->second.countType]++;
+                  countApiReps[found->second.countApiType]++;
                   Replacement Rep(*_sm, sl, length, repName);
                   Replace->insert(Rep);
                 }
@@ -1179,7 +1197,7 @@ public:
                 if (tok.is(tok::string_literal)) {
                   StringRef s(tok.getLiteralData(), tok.getLength());
                   processString(unquoteStr(s), N, Replace, *_sm, tok.getLocation(),
-                    countReps);
+                    countReps, countApiReps);
                 }
               }
             }
@@ -1195,7 +1213,8 @@ public:
   void setSourceManager(SourceManager *sm) { _sm = sm; }
   void setPreprocessor(Preprocessor *pp) { _pp = pp; }
   void setMatch(Cuda2HipCallback *match) { Match = match; }
-  int64_t countReps[ConvTypes::CONV_LAST] = {0};
+  int64_t countReps[CONV_LAST]  = { 0 };
+  int64_t countApiReps[API_LAST] = { 0 };
 
 private:
   SourceManager *_sm;
@@ -1214,29 +1233,23 @@ private:
     raw_svector_ostream OS(XStr);
     StringRef initialParamList;
     OS << "hipLaunchParm lp";
-    size_t replacementLength = OS.str().size();
+    size_t repLength = OS.str().size();
     SourceLocation sl = kernelDecl->getNameInfo().getEndLoc();
     SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(
       sl, tok::l_paren, *SM, DefaultLangOptions, true);
     DEBUG(dbgs() << kernelArgListStart.printToString(*SM));
     if (kernelDecl->getNumParams() > 0) {
       const ParmVarDecl *pvdFirst = kernelDecl->getParamDecl(0);
-      const ParmVarDecl *pvdLast =
-        kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1);
+      const ParmVarDecl *pvdLast =  kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1);
       SourceLocation kernelArgListStart(pvdFirst->getLocStart());
       SourceLocation kernelArgListEnd(pvdLast->getLocEnd());
-      SourceLocation stop = Lexer::getLocForEndOfToken(
-        kernelArgListEnd, 0, *SM, DefaultLangOptions);
-      replacementLength +=
-        SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
-      initialParamList = StringRef(SM->getCharacterData(kernelArgListStart),
-        replacementLength);
+      SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions);
+      repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
+      initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength);
       OS << ", " << initialParamList;
     }
-    DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n"
-      << "new paramlist: " << OS.str() << "\n");
-    Replacement Rep0(*(Result.SourceManager), kernelArgListStart,
-      replacementLength, OS.str());
+    DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n");
+    Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
     Replace->insert(Rep0);
   }
 
@@ -1267,6 +1280,7 @@ private:
         }
         if (bReplace) {
           countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
           Replacement Rep(*SM, sl, length, repName);
           Replace->insert(Rep);
         }
@@ -1308,11 +1322,8 @@ private:
           SourceLocation sl(arg->getLocStart());
           SourceLocation el(arg->getLocEnd());
           SourceLocation stop = Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
-          StringRef outs(SM->getCharacterData(sl),
-            SM->getCharacterData(stop) - SM->getCharacterData(sl));
-          DEBUG(dbgs() << "args[ " << argno << "]" << outs << " <"
-            << pvd->getType().getAsString() << ">"
-            << "\n");
+          StringRef outs(SM->getCharacterData(sl), SM->getCharacterData(stop) - SM->getCharacterData(sl));
+          DEBUG(dbgs() << "args[ " << argno << "]" << outs << " <" << pvd->getType().getAsString() << ">\n");
           if (pvd->getType().getAsString().compare("dim3") == 0) {
             OS << " dim3(" << outs << "),";
           } else {
@@ -1326,22 +1337,20 @@ private:
         const Expr *arg = launchKernel->getArg(argno);
         SourceLocation sl(arg->getLocStart());
         SourceLocation el(arg->getLocEnd());
-        SourceLocation stop =
-          Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
-        std::string outs(SM->getCharacterData(sl),
-          SM->getCharacterData(stop) - SM->getCharacterData(sl));
+        SourceLocation stop = Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions);
+        std::string outs(SM->getCharacterData(sl), SM->getCharacterData(stop) - SM->getCharacterData(sl));
         DEBUG(dbgs() << outs << "\n");
         OS << " " << outs << ",";
       }
       XStr.pop_back();
       OS << ")";
-      size_t length =
-        SM->getCharacterData(Lexer::getLocForEndOfToken(
-        launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) -
-        SM->getCharacterData(launchKernel->getLocStart());
+      size_t length = SM->getCharacterData(Lexer::getLocForEndOfToken(
+                        launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) -
+                        SM->getCharacterData(launchKernel->getLocStart());
       Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str());
       Replace->insert(Rep);
-      countReps[ConvTypes::CONV_KERN]++;
+      countReps[CONV_KERN]++;
+      countApiReps[API_RUNTIME]++;
       return true;
     }
     return false;
@@ -1361,8 +1370,9 @@ private:
           name = Twine(name + "." + memberName).toStringRef(tmpData);
           const auto found = N.cuda2hipRename.find(name);
           if (found != N.cuda2hipRename.end()) {
-            countReps[found->second.countType]++;
             StringRef repName = found->second.hipName;
+            countReps[found->second.countType]++;
+            countApiReps[found->second.countApiType]++;
             SourceLocation sl = threadIdx->getLocStart();
             SourceManager *SM = Result.SourceManager;
             Replacement Rep(*SM, sl, name.size(), repName);
@@ -1380,8 +1390,9 @@ private:
       StringRef name = enumConstantRef->getDecl()->getNameAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
         SourceLocation sl = enumConstantRef->getLocStart();
         SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
@@ -1403,8 +1414,9 @@ private:
       }
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
         SourceLocation sl = enumConstantDecl->getLocStart();
         SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
@@ -1425,8 +1437,9 @@ private:
       StringRef name = QT.getAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
         SourceLocation sl = typedefVar->getLocStart();
         SourceManager *SM = Result.SourceManager;
         Replacement Rep(*SM, sl, name.size(), repName);
@@ -1445,8 +1458,9 @@ private:
         ->getNameAsString();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
         TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
         SourceManager *SM = Result.SourceManager;
@@ -1465,8 +1479,9 @@ private:
         StringRef name = t->getPointeeCXXRecordDecl()->getName();
         const auto found = N.cuda2hipRename.find(name);
         if (found != N.cuda2hipRename.end()) {
-          countReps[found->second.countType]++;
           StringRef repName = found->second.hipName;
+          countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
           TypeLoc TL = structVarPtr->getTypeSourceInfo()->getTypeLoc();
           SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
           SourceManager *SM = Result.SourceManager;
@@ -1487,8 +1502,9 @@ private:
       StringRef name = type->getAsCXXRecordDecl()->getName();
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
         TypeLoc TL = typeInfo->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
         SourceManager *SM = Result.SourceManager;
@@ -1533,6 +1549,7 @@ private:
           Replacement Rep(*SM, slStart, repLength, repName);
           Replace->insert(Rep);
           countReps[CONV_MEM]++;
+          countApiReps[API_RUNTIME]++;
         }
       }
       return true;
@@ -1550,8 +1567,9 @@ private:
       }
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        countReps[found->second.countType]++;
         StringRef repName = found->second.hipName;
+        countReps[found->second.countType]++;
+        countApiReps[found->second.countApiType]++;
         TypeLoc TL = paramDecl->getTypeSourceInfo()->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
         SourceManager *SM = Result.SourceManager;
@@ -1574,8 +1592,9 @@ private:
           : StringRef(QT.getAsString());
         const auto found = N.cuda2hipRename.find(name);
         if (found != N.cuda2hipRename.end()) {
-          countReps[found->second.countType]++;
           StringRef repName = found->second.hipName;
+          countReps[found->second.countType]++;
+          countApiReps[found->second.countApiType]++;
           TypeLoc TL = paramDeclPtr->getTypeSourceInfo()->getTypeLoc();
           SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
           SourceManager *SM = Result.SourceManager;
@@ -1602,7 +1621,7 @@ private:
       if (sLiteral->getCharByteWidth() == 1) {
         StringRef s = sLiteral->getString();
         SourceManager *SM = Result.SourceManager;
-        processString(s, N, Replace, *SM, sLiteral->getLocStart(), countReps);
+        processString(s, N, Replace, *SM, sLiteral->getLocStart(), countReps, countApiReps);
       }
       return true;
     }
@@ -1640,10 +1659,12 @@ public:
       Replacement Rep(*SM, SM->getLocForStartOfFile(SM->getMainFileID()), 0, repName);
       Replace->insert(Rep);
       countReps[CONV_INCLUDE_CUDA_MAIN_H]++;
+      countApiReps[API_RUNTIME]++;
     }
   }
 
-  int64_t countReps[ConvTypes::CONV_LAST] = {0};
+  int64_t countReps[CONV_LAST]  = { 0 };
+  int64_t countApiReps[API_LAST] = { 0 };
 
 private:
   Replacements *Replace;
@@ -1659,6 +1680,7 @@ void HipifyPPCallbacks::handleEndSource() {
     Replacement Rep(*_sm, _sm->getLocForStartOfFile(_sm->getMainFileID()), 0, repName);
     Replace->insert(Rep);
     countReps[CONV_INCLUDE_CUDA_MAIN_H]++;
+    countApiReps[API_RUNTIME]++;
   }
 }
 
@@ -1747,9 +1769,24 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac
                             Callback);
 }
 
+void printStats(std::string fileSource, HipifyPPCallbacks &PPCallbacks, Cuda2HipCallback &Callback) {
+  int64_t sum = 0;
+  for (int i = 0; i < CONV_LAST; i++) {
+    sum += Callback.countReps[i] + PPCallbacks.countReps[i];
+  }
+  llvm::outs() << "info: converted " << sum << " CUDA->HIP refs ( ";
+  for (int i = 0; i < CONV_LAST; i++) {
+    llvm::outs() << counterNames[i] << ':' << Callback.countReps[i] + PPCallbacks.countReps[i] << ' ';
+  }
+  llvm::outs() << "), by APIs ( ";
+  for (int i = 0; i < API_LAST; i++) {
+    llvm::outs() << apiNames[i] << ':' << Callback.countApiReps[i] + PPCallbacks.countApiReps[i] << ' ';
+  }
+  llvm::outs() << ") in \'" << fileSource << "\'\n";
+}
+
 int main(int argc, const char **argv) {
   llvm::sys::PrintStackTraceOnErrorSignal();
-  int Result;
   CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::Required);
   std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
   std::string dst = OutputFilename;
@@ -1770,7 +1807,6 @@ int main(int argc, const char **argv) {
     }
     dst += ".cu";
   }
-
   // copy source file since tooling makes changes "inplace"
   std::ifstream source(fileSources[0], std::ios::binary);
   std::ofstream dest(Inplace ? dst + ".prehip" : dst, std::ios::binary);
@@ -1794,7 +1830,7 @@ int main(int argc, const char **argv) {
   Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
 #endif
   Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster());
-  Result = Tool.run(action.get());
+  int Result = Tool.run(action.get());
   Tool.clearArgumentsAdjusters();
 
   LangOptions DefaultLangOptions;
@@ -1825,16 +1861,8 @@ int main(int argc, const char **argv) {
     }
   }
   if (PrintStats) {
-    int64_t sum = 0;
-    for (int i = 0; i < ConvTypes::CONV_LAST; i++) {
-      sum += Callback.countReps[i] + PPCallbacks.countReps[i];
-    }
-    llvm::outs() << "info: converted " << sum << " CUDA->HIP refs ( ";
-    for (int i = 0; i < ConvTypes::CONV_LAST; i++) {
-      llvm::outs() << counterNames[i] << ':'
-                   << Callback.countReps[i] + PPCallbacks.countReps[i] << ' ';
-    }
-    llvm::outs() << ") in \'" << fileSources[0] << "\'\n";
+    printStats(fileSources[0], PPCallbacks, Callback);
   }
+
   return Result;
 }

From 79e88a6af63949afef490ce6d1ac349612446f58 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Thu, 25 Aug 2016 14:16:53 -0500
Subject: [PATCH 13/56] Added hipModuleGetGlobal and hipModuleLoadData

Change-Id: Iaec873f7d86b72911b6ad32e067a4dfe3d552fe6
---
 include/hcc_detail/hip_hcc.h         |   2 +
 include/hcc_detail/hip_runtime_api.h |   6 ++
 src/hip_module.cpp                   | 112 ++++++++++++++++++++++++++-
 3 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index 25e4c56a48..e2a4d709f2 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -401,6 +401,8 @@ public:
   hsa_executable_t executable;
   hsa_code_object_t object;
   std::string fileName;
+  void *ptr;
+  size_t size;
 };
 
 
diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index a44d1a299c..34750fea57 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -56,6 +56,8 @@ typedef struct ihipModule_t *hipModule;
 
 typedef struct ihipFunction_t *hipFunction;
 
+typedef void* hipDeviceptr;
+
 typedef struct hipEvent_t {
     struct ihipEvent_t *_handle;
 } hipEvent_t;
@@ -1111,6 +1113,10 @@ hipError_t hipModuleUnload(hipModule module);
 
 hipError_t hipModuleGetFunction(hipFunction *function, hipModule module, const char *kname);
 
+hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes, hipModule hmod, const char *name);
+
+hipError_t hipModuleLoadData(hipModule *module, const void *image);
+
 hipError_t hipLaunchModuleKernel(hipFunction f,
                               unsigned int gridDimX,
                               unsigned int gridDimY,
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 2fcfdc703a..e98cf7458d 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -24,6 +24,9 @@ THE SOFTWARE.
 #include "hcc_detail/hip_hcc.h"
 #include "hcc_detail/trace_helper.h"
 #include <fstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <elf.h>
 
 //TODO Use Pool APIs from HCC to get memory regions.
 
@@ -51,7 +54,49 @@ namespace hipdrv{
 
 }   // End namespace hipdrv
 
+uint64_t PrintSymbolSizes(const void *emi, const char *name){
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
+    if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
+    const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
+    for(uint16_t i=0;i<ehdr->e_shnum;++i){
+        if(shdr[i].sh_type == SHT_SYMTAB){
+            const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
+            assert(syms);
+            uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
+            const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
+            assert(strtab);
+            for(uint64_t i=0;i<numSyms;++i){
+                const char *symname = strtab + syms[i].st_name;
+                assert(symname);
+                uint64_t size = syms[i].st_size;
+                if(strcmp(name, symname) == 0){
+                    return size;
+                }
+            }
+        }
+    }
+    return 0;
+}
 
+uint64_t ElfSize(const void *emi){
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
+    const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
+
+    uint64_t max_offset = ehdr->e_shoff;
+    uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;
+
+    for(uint16_t i=0;i < ehdr->e_shnum;++i){
+        uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
+        if(max_offset < cur_offset){
+            max_offset = cur_offset;
+            total_size = max_offset;
+            if(SHT_NOBITS != shdr[i].sh_type){
+                total_size += static_cast<uint64_t>(shdr[i].sh_size);
+            }
+        }
+    }
+    return total_size;
+}
 
 hipError_t hipModuleLoad(hipModule *module, const char *fname){
     HIP_INIT_API(fname);
@@ -92,10 +137,12 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
             if(!ptr){
                 return hipErrorOutOfMemory;
             }
-
+            (*module)->ptr = p;
+            (*module)->size = size;
             in.seekg(0, std::ios::beg);
             std::copy(std::istreambuf_iterator<char>(in),
                       std::istreambuf_iterator<char>(), ptr);
+
             status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
 
             if(status != HSA_STATUS_SUCCESS){
@@ -233,3 +280,66 @@ Kernel argument preparation.
 
     return ret;
 }
+
+
+hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
+                              hipModule hmod, const char* name){
+    hipError_t ret = hipSuccess;
+    if(dptr == NULL || bytes == NULL){
+        return hipErrorInvalidValue;
+    }
+    if(name == NULL || hmod == NULL){
+        return hipErrorNotInitialized;
+    }
+    else{
+        hipFunction func;
+        hipModuleGetFunction(&func, hmod, name);
+        *bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t);
+        *dptr = reinterpret_cast<void*>(func->kernel);
+        return ret;
+    }
+}
+
+hipError_t hipModuleLoadData(hipModule *module, const void *image){
+    hipError_t ret;
+    if(image == NULL || module == NULL){
+        return hipErrorNotInitialized;
+    }else{
+        auto ctx = ihipGetTlsDefaultCtx();
+        *module = new ihipModule_t;
+        int deviceId = ctx->getDevice()->_deviceId;
+        ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
+
+        void *p;
+        uint64_t size = ElfSize(image);
+        hsa_agent_t agent = currentDevice->_hsaAgent;
+        hsa_region_t sysRegion;
+        hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
+        status = hsa_memory_allocate(sysRegion, size, (void**)&p);
+
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorOutOfMemory;
+        }
+
+        char *ptr = (char*)p;
+        if(!ptr){
+           return hipErrorOutOfMemory;
+        }
+        (*module)->ptr = p;
+        (*module)->size = size;
+
+        memcpy(ptr, image, size);
+
+        status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
+
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorSharedObjectInitFailed;
+        }
+
+        status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorNotInitialized;
+        }
+    }
+    return ret;
+}

From 842553a6e1f47a4ebd78701ca0ede76fe67b5c55 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Thu, 25 Aug 2016 14:34:41 -0500
Subject: [PATCH 14/56] Changed how hipEvent_t is typedefed internall - Mapped
 hipEvent_t directly to ihipEvent_t* instead of a handle

Change-Id: I5a8bcca0ef962932e0738c03eb1fc914d23022ae
---
 include/hcc_detail/hip_runtime_api.h |  4 +---
 src/hip_event.cpp                    | 19 ++++++++++---------
 src/hip_hcc.cpp                      |  2 +-
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 34750fea57..0d08563fde 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -58,9 +58,7 @@ typedef struct ihipFunction_t *hipFunction;
 
 typedef void* hipDeviceptr;
 
-typedef struct hipEvent_t {
-    struct ihipEvent_t *_handle;
-} hipEvent_t;
+typedef struct ihipEvent_t *hipEvent_t;
 
 
 /**
diff --git a/src/hip_event.cpp b/src/hip_event.cpp
index ca30c3c62b..4324583690 100644
--- a/src/hip_event.cpp
+++ b/src/hip_event.cpp
@@ -33,13 +33,14 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
 
     // TODO - support hipEventDefault, hipEventBlockingSync, hipEventDisableTiming
     if (flags == 0) {
-        ihipEvent_t *eh = event->_handle = new ihipEvent_t();
+        ihipEvent_t *eh = new ihipEvent_t();
 
         eh->_state  = hipEventStatusCreated;
         eh->_stream = NULL;
         eh->_flags  = flags;
         eh->_timestamp  = 0;
         eh->_copySeqId  = 0;
+        *event = eh;
     } else {
         e = hipErrorInvalidValue;
     }
@@ -71,7 +72,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
 {
     HIP_INIT_API(event, stream);
 
-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;
     if (eh && eh->_state != hipEventStatusUnitialized)   {
         eh->_stream = stream;
 
@@ -106,10 +107,10 @@ hipError_t hipEventDestroy(hipEvent_t event)
 {
     HIP_INIT_API(event);
 
-    event._handle->_state  = hipEventStatusUnitialized;
+    event->_state  = hipEventStatusUnitialized;
 
-    delete event._handle;
-    event._handle = NULL;
+    delete event;
+    event = NULL;
 
     // TODO - examine return additional error codes
     return ihipLogStatus(hipSuccess);
@@ -121,7 +122,7 @@ hipError_t hipEventSynchronize(hipEvent_t event)
 {
     HIP_INIT_API(event);
 
-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;
 
     if (eh) {
         if (eh->_state == hipEventStatusUnitialized) {
@@ -150,8 +151,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
 {
     HIP_INIT_API(ms, start, stop);
 
-    ihipEvent_t *start_eh = start._handle;
-    ihipEvent_t *stop_eh = stop._handle;
+    ihipEvent_t *start_eh = start;
+    ihipEvent_t *stop_eh = stop;
 
     ihipSetTs(start);
     ihipSetTs(stop);
@@ -195,7 +196,7 @@ hipError_t hipEventQuery(hipEvent_t event)
 {
     HIP_INIT_API(event);
 
-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;
 
     // TODO-stream - need to read state of signal here:  The event may have become ready after recording..
     // TODO-HCC - use get_hsa_signal here.
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 4508b6057a..589b9ef47e 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -1493,7 +1493,7 @@ const char *ihipErrorString(hipError_t hip_error)
 
 void ihipSetTs(hipEvent_t e)
 {
-    ihipEvent_t *eh = e._handle;
+    ihipEvent_t *eh = e;
     if (eh->_state == hipEventStatusRecorded) {
         // already recorded, done:
         return;

From 17289ca67db7e43a64cc5d5a3b5860a534b387ac Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Fri, 26 Aug 2016 10:35:30 +0530
Subject: [PATCH 15/56] hipcc needs to link against supc++

Change-Id: Ica0949099a0bdfe2a493341dc1cd96ec93f34f11
---
 bin/hipcc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/hipcc b/bin/hipcc
index c51e351382..c37a704f4c 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -105,7 +105,7 @@ if ($HIP_PLATFORM eq "hcc") {
     }
 
     # Satisfy HCC dependencies
-    $HIPLDFLAGS .= " -lc++abi";
+    $HIPLDFLAGS .= " -lc++abi -lsupc++";
     $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt";
 
     # Handle ROCm target platform

From 524eb687d3c9fb99b96b1e0cc26fa4d1ffd4985e Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Fri, 26 Aug 2016 13:51:33 +0530
Subject: [PATCH 16/56] Addition of hipCtxEnablePeerAccess and
 hipCtxDisablePeerAccess functions

Change-Id: I381c8cbbde17eae7d9bb5d4cb1596cebf4bda039
---
 include/hcc_detail/hip_runtime_api.h  | 29 +++++++++++++++++++++++++++
 include/nvcc_detail/hip_runtime_api.h | 10 +++++++++
 src/hip_peer.cpp                      | 18 +++++++++++++----
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 0d08563fde..1c50c33359 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1084,6 +1084,35 @@ hipError_t hipCtxSynchronize ( void );
 
 hipError_t hipCtxGetFlags ( unsigned int* flags );
 
+/**
+ * @brief Enables direct access to memory allocations in a peer context.
+ *
+ * Memory which already allocated on peer device will be mapped into the address space of the current device.  In addition, all
+ * future memory allocations on peerDeviceId will be mapped into the address space of the current device when the memory is allocated.
+ * The peer memory remains accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerCtx
+ * @param [in] flags
+ *
+ * Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
+ * @warning PeerToPeer support is experimental.
+ */
+hipError_t  hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags);
+
+/**
+ * @brief Disable direct access from current device's virtual address space to memory allocations physically located on a peer device through contex.Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
+ *
+ * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
+ *
+ * @param [in] peerCtx
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ * @warning PeerToPeer support is experimental.
+ */
+hipError_t  hipCtxDisablePeerAccess (hipCtx_t peerCtx);
+
 
 // TODO-ctx
 /**
diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index a51bca2d02..67091b8bfd 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -474,6 +474,16 @@ inline static hipError_t  hipDeviceEnablePeerAccess ( int  peerDevice, unsigned
     return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags ));
 }
 
+inline static hipError_t  hipCtxDisablePeerAccess ( hipCtx_t peerCtx )
+{
+    return hipCUDAErrorTohipError(cudaCtxDisablePeerAccess ( peerCtx ));
+}
+
+inline static hipError_t  hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int  flags )
+{
+    return hipCUDAErrorTohipError(cudaCtxEnablePeerAccess ( peerCtx, flags ));
+}
+
 inline static hipError_t hipMemcpyPeer ( void* dst, int  dstDevice, const void* src, int  srcDevice, size_t count )
 {
     return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count ));
diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp
index fd51599815..8f75d99918 100644
--- a/src/hip_peer.cpp
+++ b/src/hip_peer.cpp
@@ -67,7 +67,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_
 //---
 // Disable visibility of this device into memory allocated on peer device.
 // Remove this device from peer device peerlist.
-hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
+hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx)
 {
     HIP_INIT_API(peerCtx);
 
@@ -109,7 +109,7 @@ hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
 //---
 // Allow the current device to see all memory allocated on peerDevice.
 // This should add this device to the peer-device peer list.
-hipError_t hipDeviceEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
+hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
 {
     HIP_INIT_API(peerCtx, flags);
 
@@ -175,7 +175,7 @@ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId)
 {
     HIP_INIT_API(peerDeviceId);
 
-    return hipDeviceDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
+    return ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
 }
 
 
@@ -183,7 +183,7 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags)
 {
     HIP_INIT_API(peerDeviceId, flags);
 
-    return hipDeviceEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
+    return ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
 }
 
 
@@ -200,6 +200,16 @@ hipError_t hipMemcpyPeerAsync (void* dst, int  dstDevice, const void* src, int
     return hipMemcpyPeerAsync(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes, stream);
 }
 
+hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
+{
+    HIP_INIT_API(peerCtx, flags);
 
+    return ihipEnablePeerAccess(peerCtx, flags);
+}
 
+hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx)
+{
+    HIP_INIT_API(peerCtx);
 
+    return ihipDisablePeerAccess(peerCtx);
+}

From 51081400871177e20d94bb2f1414ac3b61190e8c Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Fri, 26 Aug 2016 14:10:36 +0530
Subject: [PATCH 17/56] NVCC path support for hipCtxXXX APIs

Change-Id: Ic7dbfbdaee9d00c0de1363c50758e5e29a96a8b2
---
 include/nvcc_detail/hip_runtime_api.h | 70 +++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index 67091b8bfd..69a8df9bf9 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -509,6 +509,76 @@ inline static hipError_t hipEventQuery(hipEvent_t event)
 	return hipCUDAErrorTohipError(cudaEventQuery(event));
 }
 
+inline static hipError_t  hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
+{
+    return hipCUDAErrorTohipError(cuCtxCreate ( ctx,flags,device ));
+}
+
+inline static hipError_t  hipCtxDestroy(hipCtx_t ctx)
+{
+    return hipCUDAErrorTohipError(cuCtxDestroy ( ctx ));
+}
+
+inline static hipError_t  hipCtxPopCurrent(hipCtx_t* ctx)
+{
+    return hipCUDAErrorTohipError(cuCtxPopCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxPushCurrent(hipCtx_t ctx)
+{
+    return hipCUDAErrorTohipError(cuCtxPushCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxSetCurrent(hipCtx_t ctx)
+{
+    return hipCUDAErrorTohipError(cuCtxSetCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxGetCurrent(hipCtx_t* ctx)
+{
+    return hipCUDAErrorTohipError(cuCtxGetCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxGetDevice(hipDevice_t *device)
+{
+    return hipCUDAErrorTohipError(cuCtxGetDevice ( device ));
+}
+
+inline static hipError_t  hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
+{
+    return hipCUDAErrorTohipError(cuCtxGetApiVersion ( ctx,apiVersion ));
+}
+
+inline static hipError_t  hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
+{
+    return hipCUDAErrorTohipError(cuCtxGetCacheConfig ( cacheConfig ));
+}
+
+inline static hipError_t  hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
+{
+    return hipCUDAErrorTohipError(cuCtxSetCacheConfig ( cacheConfig ));
+}
+
+inline static hipError_t  hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
+{
+    return hipCUDAErrorTohipError(cuCtxSetSharedMemConfig ( config ));
+}
+
+inline static hipError_t  hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
+{
+    return hipCUDAErrorTohipError(cuCtxGetSharedMemConfig ( pConfig ));
+}
+
+inline static hipError_t  hipCtxSynchronize ( void )
+{
+    return hipCUDAErrorTohipError(cuCtxSynchronize ( void ));
+}
+
+inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
+{
+    return hipCUDAErrorTohipError(cuCtxGetFlags ( flags ));
+}
+
 #ifdef __cplusplus
 }
 #endif

From ae77d4b6d7f2eacb736ca659f0d369c18a479468 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Fri, 26 Aug 2016 15:32:49 +0530
Subject: [PATCH 18/56] Resolved errors due to hipCtxXXX APIs

Change-Id: Iffac0095c4352864eca622ea318d2291571b5153
---
 include/nvcc_detail/hip_runtime_api.h | 78 ++++++++++++++++-----------
 1 file changed, 47 insertions(+), 31 deletions(-)

diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index 69a8df9bf9..3ef12a294b 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -22,7 +22,7 @@ THE SOFTWARE.
 #pragma once
 
 #include <cuda_runtime_api.h>
-
+#include <cuda.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,6 +60,10 @@ hipMemcpyHostToHost
 
 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;
+typedef CUcontext hipCtx_t;
+typedef CUsharedconfig hipSharedMemConfig;
+typedef CUfunc_cache hipFuncCache;
+typedef CUdevice hipDevice_t;
 //typedef cudaChannelFormatDesc hipChannelFormatDesc;
 #define hipChannelFormatDesc cudaChannelFormatDesc
 
@@ -85,6 +89,18 @@ switch(cuError) {
 };
 }
 
+inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further
+switch(cuError) {
+    case CUDA_SUCCESS                            : return hipSuccess;
+    case CUDA_ERROR_OUT_OF_MEMORY                : return hipErrorMemoryAllocation            ;
+    case CUDA_ERROR_INVALID_VALUE                : return hipErrorInvalidValue                ;
+    case CUDA_ERROR_INVALID_DEVICE               : return hipErrorInvalidDevice               ;
+    case CUDA_ERROR_DEINITIALIZED            : return hipErrorInitializationError         ;
+    case CUDA_ERROR_NO_DEVICE                  : return hipErrorNoDevice                    ;
+    default                                      : return hipErrorUnknown;  // Note - translated error.
+};
+}
+
 // TODO   match the error enum names of hip and cuda
 inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
 switch(hError) {
@@ -347,20 +363,6 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
     return hipCUDAErrorTohipError(cerror);
 }
 
-template<class T>
-inline static hipError_t hipOccupancyMaxPotentialBlockSize(
-        int *minGridSize,
-        int *blockSize,
-        T func,
-        size_t dynamicSMemSize = 0,
-        int blockSizeLimit = 0,
-        unsigned int flags = 0
-        ){
-    cudaError_t cerror;
-    cerror =  cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
-    return hipCUDAErrorTohipError(cerror);
-}
-
 inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
         int *numBlocks,
         const void* func,
@@ -476,12 +478,12 @@ inline static hipError_t  hipDeviceEnablePeerAccess ( int  peerDevice, unsigned
 
 inline static hipError_t  hipCtxDisablePeerAccess ( hipCtx_t peerCtx )
 {
-    return hipCUDAErrorTohipError(cudaCtxDisablePeerAccess ( peerCtx ));
+    return hipCUResultTohipError(cuCtxDisablePeerAccess ( peerCtx ));
 }
 
 inline static hipError_t  hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int  flags )
 {
-    return hipCUDAErrorTohipError(cudaCtxEnablePeerAccess ( peerCtx, flags ));
+    return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags ));
 }
 
 inline static hipError_t hipMemcpyPeer ( void* dst, int  dstDevice, const void* src, int  srcDevice, size_t count )
@@ -511,72 +513,72 @@ inline static hipError_t hipEventQuery(hipEvent_t event)
 
 inline static hipError_t  hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
 {
-    return hipCUDAErrorTohipError(cuCtxCreate ( ctx,flags,device ));
+    return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device ));
 }
 
 inline static hipError_t  hipCtxDestroy(hipCtx_t ctx)
 {
-    return hipCUDAErrorTohipError(cuCtxDestroy ( ctx ));
+    return hipCUResultTohipError(cuCtxDestroy ( ctx ));
 }
 
 inline static hipError_t  hipCtxPopCurrent(hipCtx_t* ctx)
 {
-    return hipCUDAErrorTohipError(cuCtxPopCurrent ( ctx ));
+    return hipCUResultTohipError(cuCtxPopCurrent ( ctx ));
 }
 
 inline static hipError_t  hipCtxPushCurrent(hipCtx_t ctx)
 {
-    return hipCUDAErrorTohipError(cuCtxPushCurrent ( ctx ));
+    return hipCUResultTohipError(cuCtxPushCurrent ( ctx ));
 }
 
 inline static hipError_t  hipCtxSetCurrent(hipCtx_t ctx)
 {
-    return hipCUDAErrorTohipError(cuCtxSetCurrent ( ctx ));
+    return hipCUResultTohipError(cuCtxSetCurrent ( ctx ));
 }
 
 inline static hipError_t  hipCtxGetCurrent(hipCtx_t* ctx)
 {
-    return hipCUDAErrorTohipError(cuCtxGetCurrent ( ctx ));
+    return hipCUResultTohipError(cuCtxGetCurrent ( ctx ));
 }
 
 inline static hipError_t  hipCtxGetDevice(hipDevice_t *device)
 {
-    return hipCUDAErrorTohipError(cuCtxGetDevice ( device ));
+    return hipCUResultTohipError(cuCtxGetDevice ( device ));
 }
 
 inline static hipError_t  hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
 {
-    return hipCUDAErrorTohipError(cuCtxGetApiVersion ( ctx,apiVersion ));
+    return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion ));
 }
 
 inline static hipError_t  hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
 {
-    return hipCUDAErrorTohipError(cuCtxGetCacheConfig ( cacheConfig ));
+    return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig ));
 }
 
 inline static hipError_t  hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
 {
-    return hipCUDAErrorTohipError(cuCtxSetCacheConfig ( cacheConfig ));
+    return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig ));
 }
 
 inline static hipError_t  hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
 {
-    return hipCUDAErrorTohipError(cuCtxSetSharedMemConfig ( config ));
+    return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config ));
 }
 
 inline static hipError_t  hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
 {
-    return hipCUDAErrorTohipError(cuCtxGetSharedMemConfig ( pConfig ));
+    return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig ));
 }
 
 inline static hipError_t  hipCtxSynchronize ( void )
 {
-    return hipCUDAErrorTohipError(cuCtxSynchronize ( void ));
+    return hipCUResultTohipError(cuCtxSynchronize ( ));
 }
 
 inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
 {
-    return hipCUDAErrorTohipError(cuCtxGetFlags ( flags ));
+    return hipCUResultTohipError(cuCtxGetFlags ( flags ));
 }
 
 #ifdef __cplusplus
@@ -585,6 +587,20 @@ inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
 
 #ifdef __CUDACC__
 
+template<class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSize(
+        int *minGridSize,
+        int *blockSize,
+        T func,
+        size_t dynamicSMemSize = 0,
+        int blockSizeLimit = 0,
+        unsigned int flags = 0
+        ){
+    cudaError_t cerror;
+    cerror =  cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
+    return hipCUDAErrorTohipError(cerror);
+}
+
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t  hipBindTexture(size_t *offset,
 			                             const struct texture<T, dim, readMode> &tex,

From 1211cc931ce75da600ea05aca5a919797a86a1ff Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Fri, 26 Aug 2016 19:03:23 +0530
Subject: [PATCH 19/56] Added logic to update primary ctx when ctx stack is
 empty, updated hipCtxDestroy and ctxGetCurrent functions

Change-Id: Ia0a8943c121bc1279788a1cfa9be59af614b04a6
---
 include/hcc_detail/hip_hcc.h |  2 ++
 src/hip_context.cpp          | 38 +++++++++++++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index e2a4d709f2..48f7dac318 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -166,6 +166,7 @@ class ihipCtx_t;
 // generate trace string that can be output to stderr or to ATP file.
 #define HIP_INIT_API(...) \
 	std::call_once(hip_initialized, ihipInit);\
+    ihipCtxStackUpdate();\
     API_TRACE(__VA_ARGS__);
 
 #define ihipLogStatus(hipStatus) \
@@ -666,6 +667,7 @@ extern const char *ihipErrorString(hipError_t);
 extern ihipCtx_t    *ihipGetTlsDefaultCtx();
 extern void          ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
 extern hipError_t    ihipSynchronize(void);
+extern hipError_t    ihipCtxStackUpdate();
 
 extern ihipDevice_t *ihipGetDevice(int);
 ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index 33caf610bc..c0c98de827 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -29,6 +29,18 @@ THE SOFTWARE.
 // Stack of contexts
 thread_local std::stack<ihipCtx_t *>  tls_ctxStack;
 
+hipError_t ihipCtxStackUpdate()
+{
+    //HIP_INIT_API();
+    printf("Inside ihipCtxStackUpdate\n");
+    hipError_t e = hipSuccess;
+
+    if(tls_ctxStack.empty()) {
+            tls_ctxStack.push(ihipGetTlsDefaultCtx());
+    }
+
+    return ihipLogStatus(e);
+}
 
 hipError_t hipInit(unsigned int flags)
 {
@@ -92,11 +104,20 @@ hipError_t hipCtxDestroy(hipCtx_t ctx)
 {
     hipError_t e = hipSuccess;
     ihipCtx_t* currentCtx= ihipGetTlsDefaultCtx();
-    if(currentCtx == ctx) {
-        //need to destroy the ctx associated with calling thread
-        tls_ctxStack.pop();
+    ihipCtx_t* primaryCtx= ((ihipDevice_t*)ctx->getDevice())->_primaryCtx;
+    if(primaryCtx== ctx)
+    {
+        e = hipErrorInvalidValue;
     }
-    delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
+    else
+    {
+        if(currentCtx == ctx) {
+            //need to destroy the ctx associated with calling thread
+            tls_ctxStack.pop();
+        }
+        delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
+    }
+
     return ihipLogStatus(e);
 }
 
@@ -135,11 +156,18 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx)
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
 {
     hipError_t e = hipSuccess;
-
+#if 0
     *ctx = ihipGetTlsDefaultCtx();
     if(*ctx == nullptr) {
         *ctx = NULL;    //TODO - is it required? Can return nullptr?
     }
+#endif
+    if(!tls_ctxStack.empty()) {
+        *ctx= tls_ctxStack.top();
+    }
+    else {
+        *ctx = NULL;
+    }
     return ihipLogStatus(e);
 }
 

From 8b918b065a8dac81094d6fb7881bd23bc076e048 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 26 Aug 2016 10:32:01 -0500
Subject: [PATCH 20/56] Added NVCC support and name changes - Added NVCC
 support for module APIs - Changed hipFunction and hipModule data types to
 hipFunction_t and hipModule_t - Created new intenal ihipModuleGetFunction as
 it is used twice - Changed test to match with the new data types

Change-Id: I300a1c7fd40ed7065b1b8b9de97e3a06b96ed729
---
 include/hcc_detail/hip_runtime_api.h  | 16 +++++++-------
 include/nvcc_detail/hip_runtime_api.h | 31 +++++++++++++++++++++++++++
 src/hip_module.cpp                    | 22 ++++++++++++-------
 tests/src/hipModule.cpp               |  7 ++----
 4 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 1c50c33359..a2b5587ee6 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -52,9 +52,9 @@ typedef struct ihipDevice_t *hipDevice_t;
 
 typedef struct ihipStream_t *hipStream_t;
 
-typedef struct ihipModule_t *hipModule;
+typedef struct ihipModule_t *hipModule_t;
 
-typedef struct ihipFunction_t *hipFunction;
+typedef struct ihipFunction_t *hipFunction_t;
 
 typedef void* hipDeviceptr;
 
@@ -1134,17 +1134,17 @@ hipError_t hipDeviceGetFromId(hipDevice_t *device, int deviceId);
 hipError_t hipDriverGetVersion(int *driverVersion) ;
 
 
-hipError_t hipModuleLoad(hipModule *module, const char *fname);
+hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
 
-hipError_t hipModuleUnload(hipModule module);
+hipError_t hipModuleUnload(hipModule_t module);
 
-hipError_t hipModuleGetFunction(hipFunction *function, hipModule module, const char *kname);
+hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
 
-hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes, hipModule hmod, const char *name);
+hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes, hipModule_t hmod, const char *name);
 
-hipError_t hipModuleLoadData(hipModule *module, const void *image);
+hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 
-hipError_t hipLaunchModuleKernel(hipFunction f,
+hipError_t hipLaunchModuleKernel(hipFunction_t f,
                               unsigned int gridDimX,
                               unsigned int gridDimY,
                               unsigned int gridDimZ,
diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index 3ef12a294b..d017e37495 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -64,6 +64,10 @@ typedef CUcontext hipCtx_t;
 typedef CUsharedconfig hipSharedMemConfig;
 typedef CUfunc_cache hipFuncCache;
 typedef CUdevice hipDevice_t;
+typedef CUModule hipModule_t;
+typedef CUFunction hipFunction_t;
+typedef CUdeviceptr hipDeviceptr;
+
 //typedef cudaChannelFormatDesc hipChannelFormatDesc;
 #define hipChannelFormatDesc cudaChannelFormatDesc
 
@@ -581,6 +585,33 @@ inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
     return hipCUResultTohipError(cuCtxGetFlags ( flags ));
 }
 
+inline static hipError_t hipModuleGetFunction(hipFunction_t *function,
+                         hipModule_t module, const char *kname)
+{
+    return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
+}
+
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
+                         hipModule_t hmod, const char* name)
+{
+    return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
+}
+
+inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
+{
+    return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
+{
+    return hipCUResultTohipError(cuModuleLoadData(module, image));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod)
+{
+    return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index e98cf7458d..ba9e94d9fb 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -98,7 +98,7 @@ uint64_t ElfSize(const void *emi){
     return total_size;
 }
 
-hipError_t hipModuleLoad(hipModule *module, const char *fname){
+hipError_t hipModuleLoad(hipModule_t *module, const char *fname){
     HIP_INIT_API(fname);
     hipError_t ret = hipSuccess;
     *module = new ihipModule_t;
@@ -159,7 +159,7 @@ hipError_t hipModuleLoad(hipModule *module, const char *fname){
     return ret;
 }
 
-hipError_t hipModuleUnload(hipModule hmod){
+hipError_t hipModuleUnload(hipModule_t hmod){
     hipError_t ret = hipSuccess;
     hsa_status_t status = hsa_executable_destroy(hmod->executable);
     if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
@@ -169,7 +169,7 @@ hipError_t hipModuleUnload(hipModule hmod){
     return ret;
 }
 
-hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *name){
+hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const char *name){
     HIP_INIT_API(name);
     auto ctx = ihipGetTlsDefaultCtx();
     hipError_t ret = hipSuccess;
@@ -217,7 +217,13 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n
     return ret;
 }
 
-hipError_t hipLaunchModuleKernel(hipFunction f,
+hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
+                                const char *name)
+{
+    return ihipModuleGetFunction(hfunc, hmod, name);
+}
+
+hipError_t hipLaunchModuleKernel(hipFunction_t f,
             uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
             uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
             uint32_t sharedMemBytes, hipStream_t hStream,
@@ -283,7 +289,7 @@ Kernel argument preparation.
 
 
 hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
-                              hipModule hmod, const char* name){
+                              hipModule_t hmod, const char* name){
     hipError_t ret = hipSuccess;
     if(dptr == NULL || bytes == NULL){
         return hipErrorInvalidValue;
@@ -292,15 +298,15 @@ hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
         return hipErrorNotInitialized;
     }
     else{
-        hipFunction func;
-        hipModuleGetFunction(&func, hmod, name);
+        hipFunction_t func;
+        ihipModuleGetFunction(&func, hmod, name);
         *bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t);
         *dptr = reinterpret_cast<void*>(func->kernel);
         return ret;
     }
 }
 
-hipError_t hipModuleLoadData(hipModule *module, const void *image){
+hipError_t hipModuleLoadData(hipModule_t *module, const void *image){
     hipError_t ret;
     if(image == NULL || module == NULL){
         return hipErrorNotInitialized;
diff --git a/tests/src/hipModule.cpp b/tests/src/hipModule.cpp
index 784a01c1d0..94daa2fc5a 100644
--- a/tests/src/hipModule.cpp
+++ b/tests/src/hipModule.cpp
@@ -21,9 +21,6 @@ THE SOFTWARE.
 #include<hip_runtime_api.h>
 #include<iostream>
 #include<fstream>
-#include<hsa/hsa.h>
-#include<hsa/amd_hsa_kernel_code.h>
-#include<hsa/hsa_ven_amd_loader.h>
 #include<vector>
 
 #define LEN 64
@@ -53,8 +50,8 @@ int main(){
 
   hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
   hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-  hipModule Module;
-  hipFunction Function;
+  hipModule_t Module;
+  hipFunction_t Function;
   hipModuleLoad(&Module, fileName);
   hipModuleGetFunction(&Function, Module, kernel_name);
   hipStream_t stream;

From b94b8dbd48c4a44f10cdfd45987097f46ebe71ee Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Thu, 25 Aug 2016 12:36:52 -0500
Subject: [PATCH 21/56] Doc update to clarify supported / unsupported features

---
 docs/markdown/hip_faq.md             | 21 +++++++++++++++++----
 docs/markdown/hip_kernel_language.md |  5 +----
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md
index 8b50a9a8c7..cca0965ce2 100644
--- a/docs/markdown/hip_faq.md
+++ b/docs/markdown/hip_faq.md
@@ -42,21 +42,34 @@ HIP provides the following:
 The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
 
 ### What is not supported?
-#### Run-time features
+#### Runtime/Driver API features
+At a high-level, the following features are not supported:
 - Textures 
-- MemcpyToSymbol functions
 - Dynamic parallelism (CUDA 5.0)
 - Managed memory (CUDA 6.5)
 - Graphics interoperation with OpenGL or Direct3D
+- CUDA Driver API (Under Development)
+- CUDA IPC Functions (Under Development)
+
 - CUDA array, mipmappedArray and pitched memory
-- CUDA Driver API
+- MemcpyToSymbol functions
+
+See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
 
 #### Kernel language features
 - Device-side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0)
 - Virtual functions, indirect functions and try/catch (CUDA 4.0)
 - `__prof_trigger` 
 - PTX assembly (CUDA 4.0)
-- Several kernel features are under development.  See the [HIP Kernel Language](hip_kernel_language.md) for more information.
+- Several kernel features are under development.  See the [HIP Kernel Language](hip_kernel_language.md) for more information.  These include:
+  - printf
+  - assert__
+  - `__restrict__`
+  - `__launch_bounds__`
+  - `__threadfence*_`, `__syncthreads*`
+  - Unbounded loop unroll
+
+
 
 ### Is HIP a drop-in replacement for CUDA?
 No. HIP provides porting tools which do most of the work do convert CUDA code into portable C++ code that uses the HIP APIs.
diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md
index 88f0237706..e74a010a60 100644
--- a/docs/markdown/hip_kernel_language.md
+++ b/docs/markdown/hip_kernel_language.md
@@ -156,7 +156,7 @@ The `__constant__` keyword is supported. The host writes constant memory before
 ### `__shared__` 
 The `__shared__` keyword is supported.
 
-`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. This feature is under development.
+`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter.  HIP uses an alternate syntax based on the HIP_DYNAMIC_SHARED macro.
 
 ### `__managed__`
 Managed memory, including the `__managed__` keyword, are not supported in HIP.
@@ -537,7 +537,6 @@ HIP supports the following atomic operations.
 ### Caveats and Features Under-Development:
 
 - HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently.
-- wrapping increment and decrement are under development.
 
 ## Warp Cross-Lane Functions
 
@@ -573,8 +572,6 @@ Applications can test whether the target platform supports the any/all instructi
 
 ### Warp Shuffle Functions 
 
-The following warp shuffle instructions are under development.
-
 Half-float shuffles are not supported. The default width is warpSize---see [Warp Cross-Lane Functions](#warp-cross-lane-functions). Applications should not assume the warpSize is 32 or 64.
 
 ```

From d95c293182859a207f334ed327b2898eabbcd2ad Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Fri, 26 Aug 2016 10:46:09 -0500
Subject: [PATCH 22/56] Add docs on porting driver API

---
 docs/markdown/hip_porting_driver_api.md | 173 ++++++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 docs/markdown/hip_porting_driver_api.md

diff --git a/docs/markdown/hip_porting_driver_api.md b/docs/markdown/hip_porting_driver_api.md
new file mode 100644
index 0000000000..0fca5771b8
--- /dev/null
+++ b/docs/markdown/hip_porting_driver_api.md
@@ -0,0 +1,173 @@
+# Porting CUDA Driver API
+
+## Introduction to the CUDA Driver and Runtime APIs
+CUDA provides a separate CUDA Driver and Runtime APIs.  The two APis have significant overlap in functionality:  
+- Both APIs support events, streams, memory management, memory copy, and error handling.
+- Both APIs deliver similar performance.  
+- Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`.  For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
+- The Driver API defines a different but largely overlapping error code code space than the Runtime API, and uses a different coding convention.  For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
+
+
+The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
+
+### cuModule API
+The Module section of the Driver API provides additional control over how and when accelerator
+ code objects are loaded.
+For example, the driver API allows code objects to be loaded from files or memory pointers.
+Symbols for kernels or global data can be extracted from the loaded code objects.
+In contrast, the Runtime API automatically loads and (if necessary) compiles all of the 
+kernels from a executable binary when run.
+In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
+
+Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`.
+The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
+The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles
+a special function call and is easier to use than explicit launch API (in particular with respect to
+handling of kernel arguments).
+However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.  
+
+The Module features are useful in an environment which generate the code objects directly, such as a new
+accelerator language front-end.  Here, NVCC is not used.   Instead, the environment may have a 
+different kernel language or different compilation flow.
+Other environments have many kernels and do not want them to be all loaded automatically.
+The Module functions can be used to load the generated code objects and launch kernels.
+As we will see below, HIP defines a Module API which provides similar explict control over code
+object managemenet.
+
+### cuCtx API
+The Driver API defines "Context" and "Devices" as separate entities.  
+Contexts contain a single device, and a device can theoretically have multiple contexts.  
+Each context contains a set of streams and events specific to the context.  
+Historically contexts also defined a unique address space for the GPU, though this may not longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).  
+The Context APIs also provide a mechanism to switch between devices, which allowed
+a single CPU thread to send commands to different GPUs.  HIP as well as a recent versions
+of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or
+`cudaSetDevice`.
+
+The CUDA Runtime API unifies the Context API with the Device API.  This simplifies the APIs
+and has little loss of functionality since each Context can contain a single device, 
+and the benefits of multiple contexts has been replaced with other interfaces.   HIP provides
+a context API to facilitate easy porting from existing Driver codes.  
+In HIP, the Ctx functions largely provide an alternate syntax for changing the active device. 
+Most new applications will prefer to use `hipSetDevice` or the stream APIs.
+
+## HIP Module and Ctx APIs
+
+Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules 
+and Ctx control.   
+
+### hipModule API
+
+Like the CUDA Driver API, the Module API provides additional control 
+over how code is loaded, including options to load code from files or from in-memory pointers.  
+NVCC and HCC target different architectures and use different code object formats : NVCC
+is `cubin` or `ptx` files, while the HCC path is the `hsaco` format.   The external
+compilers which generate these code objects are responsible for generating and loading
+the correct code object for each platform.  Notably, there is not a fat binary format that
+can contain code for both NVCC and HCC platforms.     The following table summarizes the
+formats used on each platform:
+
+| Format        | APIs                         | NVCC               | HCC               |
+| ---           | ---                          | ---                | ---               |
+| Code Object   | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco            |
+| Fat Binary    | hipModuleLoadFatBin          | .fatbin            | Under Development | 
+
+hipcc uses NVCC and HCC to compile host codes.  Both of these may embed code objects
+into the final executable, and these code objects will be automatically loaded when
+the application starts.
+The hipModule API can be used to load additional code objects, and in this way
+provides an extended capability to the automatically loaded code objects.  HCC allows
+both of these capabilities to be used together, if desired.  Of course it is possible
+to create a program with no kernels and thus no automatic loading.
+
+
+### hipCtx API
+HIP provides a `Ctx` API as a thin layer over the existing Device functions.  This Ctx API
+can be used to set the current context, or to query properties of the device associated with
+the context.  The current context is implicitly used by other APIs such as `hipStreamCreate`.
+
+### hipify translation of CUDA Driver API
+The hipify tool will convert CUDA Driver APIs for streams, events, memory management to 
+the equivalent HIP driver calls.   For example, `cuEventCreate` will be translated to 
+`hipEventCreate`.  Hipify also converts error code from the Driver namespace and coding
+convention to the equivalent HIP error code.  Thus, HIP unifies the APis for these common functions.
+[hipify support for translating driver API is Under Development]
+
+The memory copy APIs require additional explanation.  The CUDA driver includes the memory
+direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides
+a single memory copy API with a parameter that specifies the direction and additionally
+supports a "default" direction where the runtime determines the direction automatically.
+HIP provides APis with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`.
+The first flavor may be faster in some cases since they avoid host overhead to detect the 
+different memory directions.
+
+HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).  
+
+### HCC Implementation Notes
+#### .hsaco
+The .hsaco format used by HCC is described in more detail [here](https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc).
+An example and blog that show how to use the format is [here](http://gpuopen.com/rocm-with-harmony-combining-opencl-hcc-hsa-in-a-single-program).  hsaco can be generated by hcc + extractkernel tool,
+cloc, the GCN assembler, or other tools.
+
+#### Address Spaces
+HCC defines a process-wide address space where the CPU and all devices allocate
+addresses from a single unified pool.  Thus addresses may be shared between contexts, and 
+unlike the original CUDA definition a new context does not create a new address space for
+the device.   
+
+#### Additional Information
+HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
+
+### NVCC Implementation Notes
+
+#### Interoperation between HIP and CUDA Driver
+CUDA applications may want to mix CUDA driver code with HIP code (see example below).  This
+table shows the type equivalence to enable this interaction.
+
+|**HIP Type** |**CU Driver Type**|  **CUDA Runtime Type** |
+| ----        | ----             | ---- | 
+| hipModule   |  CUmodule        |   			|
+| hipFunction |  CUfunction      | 			|
+| hipCtx_t    |  CUcontext       | 			|
+| hipDevice_t |  CUdevice        |  			|
+| hipStream_t |  CUstream        | cudaStream_t         |
+| hipEvent_t  |  CUevent         | cudaEvent_t          |
+| hipArray    |  CUarray         | cudaArray            |
+
+
+
+#### Compilation Flags
+The hipModule interface does not support the hipModuleLoadEx function, which is used to control PTX compilaton options.
+HCC does not use PTX and does not support the same compilation options.
+In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step.
+Code which requires this functionaly should use platform-specific coding, calling `cuModuleLoadEx` 
+on the NVCC path and hipModuleLoad on the hcc path.  For example:
+
+```
+hipModule module;
+void *imagePtr = ... ;  // Somehow populate data pointer with code object
+
+#ifdef __HIP_PLATFORM_NVCC__
+// Use CUDA driver API but write to hipModule since they are same type:
+const int numOptions = 1;
+CUJit_option options[numOptions];
+void * optionValues[numOptions];
+
+options[0] = CU_JIT_MAX_REGISTERS;
+unsigned maxRegs=15;
+optionValues[0] = (void*) (&maxRegs);
+
+cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
+
+#else // __HIP_PLATFORM_HCC__
+
+// HCC path does not support or require JIT options, so just load the module.
+hipModuleLoadData(&module, imagePtr);
+
+#endif
+
+// Back to unified code - both paths above loaded the "module" variable.
+hipFunction k;
+hipModuleGetFunction(&k, module, "myKernel");
+```
+

From 4152746e269041c5565d972573b14132d7107fbe Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 26 Aug 2016 13:11:01 -0500
Subject: [PATCH 23/56] Added explicit memory copy direction apis - Fixed stale
 printf in context api - Added 4 sync memcpy apis   1. hipMemcpyHtoD   2.
 hipMemcpyDtoH   3. hipMemcpyDtoD   4. hipMemcpyHtoH - Added test for added
 apis

Change-Id: I4a9c382445b62631f8d0bcbb9a670322288b72b1
---
 include/hcc_detail/hip_hcc.h         |  5 +-
 include/hcc_detail/hip_runtime_api.h |  8 +++
 src/hip_context.cpp                  |  1 -
 src/hip_hcc.cpp                      | 10 ++--
 src/hip_memory.cpp                   | 84 ++++++++++++++++++++++++++++
 tests/src/hipDrvMemcpy.cpp           | 26 +++++++++
 6 files changed, 125 insertions(+), 9 deletions(-)
 create mode 100644 tests/src/hipDrvMemcpy.cpp

diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index 48f7dac318..449d9bc522 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -421,8 +421,9 @@ typedef uint64_t SeqNum_t ;
     ~ihipStream_t();
 
     // kind is hipMemcpyKind
-    void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind);
-    void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind);
+    void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
+    void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
+
 
     void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
 
diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index a2b5587ee6..7f79b64ac9 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -844,6 +844,14 @@ hipError_t hipHostFree(void* ptr);
  */
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
 
+hipError_t hipMemcpyHtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+
+hipError_t hipMemcpyDtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+
+hipError_t hipMemcpyDtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+
+hipError_t hipMemcpyHtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+
 
 /**
  *  @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area pointed to by @p offset bytes from the start of symbol @p symbol.
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index c0c98de827..1caa3b4e42 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -32,7 +32,6 @@ thread_local std::stack<ihipCtx_t *>  tls_ctxStack;
 hipError_t ihipCtxStackUpdate()
 {
     //HIP_INIT_API();
-    printf("Inside ihipCtxStackUpdate\n");
     hipError_t e = hipSuccess;
 
     if(tls_ctxStack.empty()) {
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 589b9ef47e..09cf46f8a7 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -1563,7 +1563,7 @@ void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType,
 }
 
 
-void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind)
+void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
 {
     ihipCtx_t *ctx = this->getCtx();
     const ihipDevice_t *device = ctx->getDevice();
@@ -1582,7 +1582,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
     bool dstInDeviceMem = dstPtrInfo._isInDeviceMem;
 
     // Resolve default to a specific Kind so we know which algorithm to use:
-    if (kind == hipMemcpyDefault) {
+    if (kind == hipMemcpyDefault && resolveOn) {
         kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem);
     };
 
@@ -1753,14 +1753,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
 
 
 // Sync copy that acquires lock:
-void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
+void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
 {
     LockedAccessor_StreamCrit_t crit (_criticalData);
-    copySync(crit, dst, src, sizeBytes, kind);
+    copySync(crit, dst, src, sizeBytes, kind, resolveOn);
 }
 
-
-
 void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
 {
     LockedAccessor_StreamCrit_t crit(_criticalData);
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index c0282dc372..50520a4e0e 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -446,12 +446,96 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
         e = ex._code;
     }
 
+    return ihipLogStatus(e);
+}
 
+hipError_t hipMemcpyHtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
 
     return ihipLogStatus(e);
 }
 
 
+hipError_t hipMemcpyDtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyDtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyHtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+
+
 /**
  * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection,
  * @result #hipErrorInvalidValue : If dst==NULL or src==NULL, or other bad argument.
diff --git a/tests/src/hipDrvMemcpy.cpp b/tests/src/hipDrvMemcpy.cpp
new file mode 100644
index 0000000000..55c9b18818
--- /dev/null
+++ b/tests/src/hipDrvMemcpy.cpp
@@ -0,0 +1,26 @@
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+
+#define LEN 1024
+#define SIZE LEN<<2
+
+int main(){
+  int *A, *B, *C;
+  hipDeviceptr Ad, Bd;
+  A = new int[LEN];
+  B = new int[LEN];
+  C = new int[LEN];
+  for(int i=0;i<LEN;i++){
+    A[i] = i;
+  }
+  hipMalloc((void**)&Ad, SIZE);
+  hipMalloc((void**)&Bd, SIZE);
+  hipMemcpyHtoD(Ad, A, SIZE);
+  hipMemcpyDtoD(Bd, Ad, SIZE);
+  hipMemcpyDtoH(B, Bd, SIZE);
+  hipMemcpyHtoH(C, B,SIZE);
+  for(int i=0;i<16;i++){
+    std::cout<<A[i]<<" "<<C[i]<<std::endl;
+  }
+}

From 8155f0e036b595300e31ede6d462e75acaddd98c Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Sat, 27 Aug 2016 11:13:56 -0500
Subject: [PATCH 24/56] added memset and memcpy device functions - Added memcpy
 and memset device functions - Added test for memcpy and memset

Change-Id: Icd21a8dd964953b86d5e92889bf1664bee647219
---
 include/hcc_detail/hip_runtime.h | 51 ++++++++++++++++++++++++++++++++
 tests/src/hipDeviceMemcpy.cpp    | 42 ++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 tests/src/hipDeviceMemcpy.cpp

diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h
index d009bec35b..740e60d0eb 100644
--- a/include/hcc_detail/hip_runtime.h
+++ b/include/hcc_detail/hip_runtime.h
@@ -568,6 +568,55 @@ __device__ void  __threadfence_system(void);
 #define hipGridDim_y   (hc_get_num_groups(1))
 #define hipGridDim_z   (hc_get_num_groups(2))
 
+// loop unrolling
+__device__ static inline void* memcpy(void* dst, void* src, size_t size)
+{
+    uint64_t i = 0;
+    uint64_t totalLength = size/sizeof(uint32_t);
+    for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
+                  i<(totalLength/4);
+                  i = i + hipBlockDim_x * hipGridDim_x)
+    {
+        ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
+        ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
+        ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
+        ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
+    }
+    if(4*i < totalLength){
+        ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
+        ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
+        ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
+        ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
+
+    }
+    return nullptr;
+}
+
+__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
+{
+    uint32_t _val = 0;
+    _val = (val | val << 8 | val << 16 | val << 24);
+    uint64_t totalLength = size/sizeof(uint32_t);
+    uint64_t i = 0;
+    for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
+                  i<(totalLength/4);
+                  i = i + hipBlockDim_x * hipGridDim_x)
+    {
+        ((uint32_t*)ptr)[4*i] = _val;
+        ((uint32_t*)ptr)[4*i+1] = _val;
+        ((uint32_t*)ptr)[4*i+2] = _val;
+        ((uint32_t*)ptr)[4*i+3] = _val;
+    }
+    if(4*i < totalLength){
+        ((uint32_t*)ptr)[4*i] = _val;
+        ((uint32_t*)ptr)[4*i+1] = _val;
+        ((uint32_t*)ptr)[4*i+2] = _val;
+        ((uint32_t*)ptr)[4*i+3] = _val;
+
+    }
+    return nullptr;
+}
+
 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
 
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
@@ -653,4 +702,6 @@ do {\
  */
 
 
+
+
 #endif
diff --git a/tests/src/hipDeviceMemcpy.cpp b/tests/src/hipDeviceMemcpy.cpp
new file mode 100644
index 0000000000..7c1cc59808
--- /dev/null
+++ b/tests/src/hipDeviceMemcpy.cpp
@@ -0,0 +1,42 @@
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+
+#define LEN 1030
+#define SIZE LEN << 2
+
+__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
+{
+    memcpy(Out, In, SIZE, Vald);
+}
+
+__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
+{
+    memset(ptr, val, size);
+}
+
+int main()
+{
+    uint32_t *A, *Ad, *B, *Bd;
+    uint32_t *Val, *Vald;
+    A = new uint32_t[LEN];
+    B = new uint32_t[LEN];
+    Val = new uint32_t;
+    *Val = 0;
+    for(int i=0;i<LEN;i++){
+        A[i] = i *1.0f;
+        B[i] = 0.0f;
+    }
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+    hipMalloc((void**)&Vald, sizeof(uint32_t));
+    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+    hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
+    hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
+    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+    hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
+    for(int i=LEN-16;i<LEN;i++){
+        std::cout<<A[i]<<" "<<B[i]<<std::endl;
+    }
+    std::cout<<*Val<<std::endl;
+}

From 685601f27cb1a396c9c6ca62d30c6dbb0e9dcbf9 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Sun, 28 Aug 2016 16:34:34 -0500
Subject: [PATCH 25/56] Remove deprecated message from hipLaunchModuleKernel

Change-Id: I87675453ae4363e3340a9d1491bb00543fa8c6e0
---
 include/hcc_detail/hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 7f79b64ac9..1e56949c08 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1162,7 +1162,7 @@ hipError_t hipLaunchModuleKernel(hipFunction_t f,
                               unsigned int sharedMemBytes,
                               hipStream_t stream,
                               void **kernelParams,
-                              void **extra) __attribute__((deprecated("kernelParams is not fully supported, use extra instead"))) ;
+                              void **extra) ;
 
 // doxygen end Version Management
 /**

From 5551b6c2f6181fafbd41c901ebfdc1088d018d99 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Sun, 28 Aug 2016 16:35:27 -0500
Subject: [PATCH 26/56] Remove cutnpaste check on uninitialized hsa status

Change-Id: Icc0256b9b1bd01d45cca4feb1106cfe7427bcd15
---
 src/hip_module.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index ba9e94d9fb..3923817e95 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -188,10 +188,6 @@ hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const ch
         hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
 
         hsa_status_t status;
-        if(status != HSA_STATUS_SUCCESS){
-            return hipErrorNotInitialized;
-        }
-
         status = hsa_executable_load_code_object(hmod->executable, gpuAgent, hmod->object, NULL);
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotInitialized;

From 3d1bf4dbaf08c5f1cb16de192b52b6da717dc7a3 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Sun, 28 Aug 2016 16:48:57 -0500
Subject: [PATCH 27/56] Cleaned up module api - Moved HIP_INIT_API from
 internal to all public apis - renamed hipLaunchModuleKernel to
 hipModuleLaunchKernel - Changed tests according to the changed api

Change-Id: I822ff63c7c5b7dad340ece49456baf9d89428e9f
---
 include/hcc_detail/hip_runtime_api.h |  2 +-
 src/hip_module.cpp                   | 16 ++++++++--------
 tests/src/hipModule.cpp              |  2 +-
 tests/src/hipModuleUnload.cpp        |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 1e56949c08..73f3a5b8b2 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1152,7 +1152,7 @@ hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes, hipModule_t hmo
 
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 
-hipError_t hipLaunchModuleKernel(hipFunction_t f,
+hipError_t hipModuleLaunchKernel(hipFunction_t f,
                               unsigned int gridDimX,
                               unsigned int gridDimY,
                               unsigned int gridDimZ,
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 3923817e95..d4810bed63 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -170,7 +170,6 @@ hipError_t hipModuleUnload(hipModule_t hmod){
 }
 
 hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const char *name){
-    HIP_INIT_API(name);
     auto ctx = ihipGetTlsDefaultCtx();
     hipError_t ret = hipSuccess;
 
@@ -206,20 +205,17 @@ hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const ch
         if(status != HSA_STATUS_SUCCESS){
             return hipErrorNotFound;
         }
-
-
     }
-
     return ret;
 }
 
 hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
-                                const char *name)
-{
+                                const char *name){
+    HIP_INIT_API(name);
     return ihipModuleGetFunction(hfunc, hmod, name);
 }
 
-hipError_t hipLaunchModuleKernel(hipFunction_t f,
+hipError_t hipModuleLaunchKernel(hipFunction_t f,
             uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
             uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
             uint32_t sharedMemBytes, hipStream_t hStream,
@@ -286,6 +282,7 @@ Kernel argument preparation.
 
 hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
                               hipModule_t hmod, const char* name){
+    HIP_INIT_API(name);
     hipError_t ret = hipSuccess;
     if(dptr == NULL || bytes == NULL){
         return hipErrorInvalidValue;
@@ -303,7 +300,8 @@ hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
 }
 
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image){
-    hipError_t ret;
+    HIP_INIT_API(image);
+    hipError_t ret = hipSuccess;
     if(image == NULL || module == NULL){
         return hipErrorNotInitialized;
     }else{
@@ -345,3 +343,5 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image){
     }
     return ret;
 }
+
+
diff --git a/tests/src/hipModule.cpp b/tests/src/hipModule.cpp
index 94daa2fc5a..86ccb3ab9f 100644
--- a/tests/src/hipModule.cpp
+++ b/tests/src/hipModule.cpp
@@ -71,7 +71,7 @@ int main(){
       HIP_LAUNCH_PARAM_END
     };
 
-    hipLaunchModuleKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config); 
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config); 
 
     hipStreamDestroy(stream);
 
diff --git a/tests/src/hipModuleUnload.cpp b/tests/src/hipModuleUnload.cpp
index fd23a4717e..2e37414c2c 100644
--- a/tests/src/hipModuleUnload.cpp
+++ b/tests/src/hipModuleUnload.cpp
@@ -24,7 +24,7 @@ THE SOFTWARE.
 #define fileName "vcpy_isa.co"
 
 int main(){
-    hipModule module;
+    hipModule_t module;
     hipModuleLoad(&module, fileName);
     hipModuleUnload(module);
 }

From 9e21549139378606e2641cb055edc3966ba8aebb Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Sun, 28 Aug 2016 22:50:44 -0500
Subject: [PATCH 28/56] Fixed nvcc path for module apis

Change-Id: I43c7a6bc226f4f270c37f3c4af86b3b3efd0b175
---
 include/nvcc_detail/hip_runtime_api.h | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index d017e37495..f6af0a608e 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -585,6 +585,16 @@ inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
     return hipCUResultTohipError(cuCtxGetFlags ( flags ));
 }
 
+inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
+{
+    return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod)
+{
+    return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
 inline static hipError_t hipModuleGetFunction(hipFunction_t *function,
                          hipModule_t module, const char *kname)
 {
@@ -597,19 +607,21 @@ inline static hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
     return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
 }
 
-inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
-{
-    return hipCUResultTohipError(cuModuleLoad(module, fname));
-}
-
 inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
 {
     return hipCUResultTohipError(cuModuleLoadData(module, image));
 }
 
-inline static hipError_t hipModuleUnload(hipModule_t hmod)
+inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
+      unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+      unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+      unsigned int sharedMemBytes, hipStream_t stream,
+      void **kernelParams, void **extra)
 {
-    return hipCUResultTohipError(cuModuleUnload(hmod));
+    return hipCUResultTohipError(cuLaunchKernel(f, 
+                    gridDimX, gridDimY, gridDimZ,
+                    blockDimX, blockDimY, blockDimZ,
+                    shreadMemBytes, stream, kernelParams, extra);
 }
 
 #ifdef __cplusplus

From 99727231a35c6c4ec5201d3eb76dac8760c8dfbc Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 29 Aug 2016 08:04:08 -0500
Subject: [PATCH 29/56] Doc update.  Describe memcpytosymbol,
 threadfence_system workarounds

---
 docs/markdown/hip_faq.md           | 11 ++++---
 docs/markdown/hip_porting_guide.md | 46 ++++++++++++++++++++++++++++--
 2 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md
index cca0965ce2..d62f40510a 100644
--- a/docs/markdown/hip_faq.md
+++ b/docs/markdown/hip_faq.md
@@ -32,10 +32,12 @@
 HIP provides the following:
 - Devices (hipSetDevice(), hipGetDeviceProperties(), etc.)
 - Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.)
-- Streams (hipStreamCreate(), etc.)
+- Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(),  etc.)
 - Events (hipEventRecord(), hipEventElapsedTime(), etc.)
 - Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>)
+- HIP Module API to control when adn how code is loaded.
 - CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim)
+- Cross-lane instructions including shfl, ballot, any, all
 - Most device-side math built-ins
 - Error reporting (hipGetLastError(), hipGetErrorString())
 
@@ -53,6 +55,7 @@ At a high-level, the following features are not supported:
 
 - CUDA array, mipmappedArray and pitched memory
 - MemcpyToSymbol functions
+- Queue priority controls
 
 See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
 
@@ -60,10 +63,10 @@ See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for
 - Device-side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0)
 - Virtual functions, indirect functions and try/catch (CUDA 4.0)
 - `__prof_trigger` 
-- PTX assembly (CUDA 4.0)
+- PTX assembly (CUDA 4.0).  HCC supports inline GCN assembly.
 - Several kernel features are under development.  See the [HIP Kernel Language](hip_kernel_language.md) for more information.  These include:
   - printf
-  - assert__
+  - assert
   - `__restrict__`
   - `__launch_bounds__`
   - `__threadfence*_`, `__syncthreads*`
@@ -74,7 +77,7 @@ See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for
 ### Is HIP a drop-in replacement for CUDA?
 No. HIP provides porting tools which do most of the work do convert CUDA code into portable C++ code that uses the HIP APIs.
 Most developers will port their code from CUDA to HIP and then maintain the HIP version. 
-HIP code provides the same performance as coding in native CUDA, plus the benefit that the code can also run on AMD platforms.
+HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
 
 ### What specific version of CUDA does HIP support?
 HIP APIs and features do not map to a specific CUDA version.  HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can 
diff --git a/docs/markdown/hip_porting_guide.md b/docs/markdown/hip_porting_guide.md
index 4d376f4a5f..45e5455ea4 100644
--- a/docs/markdown/hip_porting_guide.md
+++ b/docs/markdown/hip_porting_guide.md
@@ -395,13 +395,53 @@ For new projects or ports which can be re-factored, we recommend the use of the
 This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
 run hipcc when appropriate.
 
-### Workarounds
+## Workarounds
 
-#### warpSize
+### warpSize
 Code should not assume a warp size of 32 or 64.  See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.
 
+## memcpyToSymbol
 
-#### Textures and Cache Control
+HIP support for hipMemCpyToSymbol is under-development.  This feature allows a kernel
+to define a device-side data symbol which can be accessed on the host side.  The symbol
+can be in __constant or device space.  As a workaround, programs can pass the symbol
+as an argument to the kernel, and use standard hipMemcpy routines to initialize it.
+
+For example:
+
+Device Code:
+```
+// Cuda Device Code
+__constant__ float Array[1024];
+__global__ void Inc(float *Out){
+   Int tx = hipThreadIdx_x;
+   Out[tx] = Array[tx] + 1;
+}
+ 
+// HIP Device Code
+__global__ void Inc(hipLaunchParm lp, float *Array, float *Out){
+   Int tx = hipThreadIdx_x;
+   Out[tx] = Array[tx] + 1;
+}
+```
+ 
+Host Code:
+```
+// CUDA Host Code
+cudaMemcpyToSymbol(Array, hostArray, sizeofArray);
+ 
+// HIP Host Code
+hipMemcpy(Array, hostArray, sizeofArray);
+```
+ 
+## threadfence_system
+Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
+Some implementations can provide this behavior by flushing the GPU L2 cache.
+HIP/HCC does not provide this functionality.  As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to 
+disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have 
+a performance impact.
+
+### Textures and Cache Control
 
 >Texture support is under-development and not yet supported by HIP.
 

From 7ae453f349621e6f3be99676723bd777c61646a4 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 15:00:26 -0500
Subject: [PATCH 30/56] added sample for module api which works on both cuda
 and rocm

Change-Id: Id88abb3698a224177fd0531e3e16013dde6dd95e
---
 tests/src/sampleModule.cpp | 94 ++++++++++++++++++++++++++++++++++++++
 tests/src/vcpy_isa.cu      |  6 +++
 tests/src/vcpy_isa.ptx     | 38 +++++++++++++++
 3 files changed, 138 insertions(+)
 create mode 100644 tests/src/sampleModule.cpp
 create mode 100644 tests/src/vcpy_isa.cu
 create mode 100644 tests/src/vcpy_isa.ptx

diff --git a/tests/src/sampleModule.cpp b/tests/src/sampleModule.cpp
new file mode 100644
index 0000000000..bef7345b4f
--- /dev/null
+++ b/tests/src/sampleModule.cpp
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__ 
+#define fileName "vcpy_isa.co"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#endif
+
+#define kernel_name "hello_world"
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+        std::cout<<A[i] << " "<<B[i]<<std::endl;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+  	cuInit(0);
+	  hipDevice_t device;
+	  hipCtx_t context;
+	  hipDeviceGet(&device, 0);
+	  hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); 
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=0;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+	  hipCtxDetach(context);
+#endif 
+
+    return 0;
+}
diff --git a/tests/src/vcpy_isa.cu b/tests/src/vcpy_isa.cu
new file mode 100644
index 0000000000..d2a0838604
--- /dev/null
+++ b/tests/src/vcpy_isa.cu
@@ -0,0 +1,6 @@
+
+extern "C" __global__ void hello_world(float *a, float *b)
+{
+	int tx = threadIdx.x;
+	b[tx] = a[tx];
+}
diff --git a/tests/src/vcpy_isa.ptx b/tests/src/vcpy_isa.ptx
new file mode 100644
index 0000000000..62eb3f63df
--- /dev/null
+++ b/tests/src/vcpy_isa.ptx
@@ -0,0 +1,38 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
+// Based on LLVM 3.4svn
+//
+
+.version 4.3
+.target sm_20
+.address_size 64
+
+	// .globl	hello_world
+
+.visible .entry hello_world(
+	.param .u64 hello_world_param_0,
+	.param .u64 hello_world_param_1
+)
+{
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<2>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [hello_world_param_0];
+	ld.param.u64 	%rd2, [hello_world_param_1];
+	cvta.to.global.u64 	%rd3, %rd2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mov.u32 	%r1, %tid.x;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f32 	%f1, [%rd6];
+	add.s64 	%rd7, %rd3, %rd5;
+	st.global.f32 	[%rd7], %f1;
+	ret;
+}
+
+

From 4b377f63d0656fac2d362890a0bf277db9e2b4ef Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 15:05:12 -0500
Subject: [PATCH 31/56] Changed module api to work with nvcc and hipcc path -
 Added cuda and cudart linking for nvcc path in hipcc - Added hipMemcpyHtoD
 and hipMemcpyDtoH for nvcc path - Changed hipDeviceptr to hipDeviceptr_t -
 Fixed hipMemcpy*to* API signatues

Change-Id: I6ef076b507f92502efda475c83dcdcdd462afc37
---
 bin/hipcc                             |  2 +-
 include/hcc_detail/hip_runtime_api.h  | 12 ++++-----
 include/nvcc_detail/hip_runtime_api.h | 38 ++++++++++++++++++++++-----
 src/hip_memory.cpp                    |  8 +++---
 src/hip_module.cpp                    |  2 +-
 5 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/bin/hipcc b/bin/hipcc
index c37a704f4c..e91993b460 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -151,7 +151,7 @@ if ($HIP_PLATFORM eq "hcc") {
     $HIPCC="$CUDA_PATH/bin/nvcc";
     $HIPCXXFLAGS .= " -I$CUDA_PATH/include";
 
-    $HIPLDFLAGS = "";
+    $HIPLDFLAGS = "-lcuda -lcudart";
 } else {
     printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'");
     exit (-1);
diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 73f3a5b8b2..0b3be62c3b 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -56,7 +56,7 @@ typedef struct ihipModule_t *hipModule_t;
 
 typedef struct ihipFunction_t *hipFunction_t;
 
-typedef void* hipDeviceptr;
+typedef void* hipDeviceptr_t;
 
 typedef struct ihipEvent_t *hipEvent_t;
 
@@ -844,13 +844,13 @@ hipError_t hipHostFree(void* ptr);
  */
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
 
-hipError_t hipMemcpyHtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
 
-hipError_t hipMemcpyDtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
 
-hipError_t hipMemcpyDtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
 
-hipError_t hipMemcpyHtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes);
+hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes);
 
 
 /**
@@ -1148,7 +1148,7 @@ hipError_t hipModuleUnload(hipModule_t module);
 
 hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
 
-hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes, hipModule_t hmod, const char *name);
+hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
 
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 
diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index f6af0a608e..d7f5d521a4 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -58,15 +58,19 @@ hipMemcpyHostToHost
 #define hipHostRegisterPortable cudaHostRegisterPortable
 #define hipHostRegisterMapped cudaHostRegisterMapped
 
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE     CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_END            CU_LAUNCH_PARAM_END
+
 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;
 typedef CUcontext hipCtx_t;
 typedef CUsharedconfig hipSharedMemConfig;
 typedef CUfunc_cache hipFuncCache;
 typedef CUdevice hipDevice_t;
-typedef CUModule hipModule_t;
-typedef CUFunction hipFunction_t;
-typedef CUdeviceptr hipDeviceptr;
+typedef CUmodule hipModule_t;
+typedef CUfunction hipFunction_t;
+typedef CUdeviceptr hipDeviceptr_t;
 
 //typedef cudaChannelFormatDesc hipChannelFormatDesc;
 #define hipChannelFormatDesc cudaChannelFormatDesc
@@ -202,6 +206,19 @@ inline static hipError_t hipHostFree(void* ptr)  {
 inline static hipError_t hipSetDevice(int device) {
     return hipCUDAErrorTohipError(cudaSetDevice(device));
 }
+
+inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, 
+                  void* src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoH(void* dst, 
+                  hipDeviceptr_t src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
+}
+
 inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind) {
   return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
 }
@@ -464,7 +481,6 @@ inline static hipError_t hipDriverGetVersion(int *driverVersion)
 	return hipCUDAErrorTohipError(err);
 }
 
-
 inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int  device, int  peerDevice )
 {
     return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
@@ -585,6 +601,16 @@ inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
     return hipCUResultTohipError(cuCtxGetFlags ( flags ));
 }
 
+inline static hipError_t hipCtxDetach(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxDetach(ctx));
+}
+
+inline static hipError_t hipDeviceGet(hipDevice_t *device, int ordinal)
+{
+    return hipCUResultTohipError(cuDeviceGet(device, ordinal));
+}
+
 inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
 {
     return hipCUResultTohipError(cuModuleLoad(module, fname));
@@ -601,7 +627,7 @@ inline static hipError_t hipModuleGetFunction(hipFunction_t *function,
     return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
 }
 
-inline static hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
                          hipModule_t hmod, const char* name)
 {
     return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
@@ -621,7 +647,7 @@ inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
     return hipCUResultTohipError(cuLaunchKernel(f, 
                     gridDimX, gridDimY, gridDimZ,
                     blockDimX, blockDimY, blockDimZ,
-                    shreadMemBytes, stream, kernelParams, extra);
+                    sharedMemBytes, stream, kernelParams, extra));
 }
 
 #ifdef __cplusplus
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 50520a4e0e..c72e92fb50 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -449,7 +449,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
     return ihipLogStatus(e);
 }
 
-hipError_t hipMemcpyHtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
 {
     HIP_INIT_API(dst, src, sizeBytes);
 
@@ -471,7 +471,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
 }
 
 
-hipError_t hipMemcpyDtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
 {
     HIP_INIT_API(dst, src, sizeBytes);
 
@@ -492,7 +492,7 @@ hipError_t hipMemcpyDtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
     return ihipLogStatus(e);
 }
 
-hipError_t hipMemcpyDtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
 {
     HIP_INIT_API(dst, src, sizeBytes);
 
@@ -513,7 +513,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
     return ihipLogStatus(e);
 }
 
-hipError_t hipMemcpyHtoH(hipDeviceptr dst, hipDeviceptr src, size_t sizeBytes)
+hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
 {
     HIP_INIT_API(dst, src, sizeBytes);
 
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index d4810bed63..e43cc62829 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -280,7 +280,7 @@ Kernel argument preparation.
 }
 
 
-hipError_t hipModuleGetGlobal(hipDeviceptr *dptr, size_t *bytes,
+hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
                               hipModule_t hmod, const char* name){
     HIP_INIT_API(name);
     hipError_t ret = hipSuccess;

From dfc60a83597c054980493569c09692f71f9e5eed Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 15:20:09 -0500
Subject: [PATCH 32/56] added hipInit for cuInit nvcc path

Change-Id: I594d08d936ac5d06f16e42c63062ac5776b65a0c
---
 include/nvcc_detail/hip_runtime_api.h | 11 +++++++++++
 tests/src/sampleModule.cpp            |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index d7f5d521a4..9ca3542569 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -148,6 +148,11 @@ default:
 }
 }
 
+inline static hipError_t hipInit(unsigned int flags)
+{
+    return hipCUResultTohipError(cuInit(flags));
+}
+
 inline static hipError_t hipDeviceReset() {
     return hipCUDAErrorTohipError(cudaDeviceReset());
 }
@@ -219,6 +224,12 @@ inline static hipError_t hipMemcpyDtoH(void* dst,
     return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
 }
 
+inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst,
+            hipDeviceptr_t src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
+}
+
 inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind) {
   return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
 }
diff --git a/tests/src/sampleModule.cpp b/tests/src/sampleModule.cpp
index bef7345b4f..606b19717d 100644
--- a/tests/src/sampleModule.cpp
+++ b/tests/src/sampleModule.cpp
@@ -50,7 +50,7 @@ int main(){
 
 
 #ifdef __HIP_PLATFORM_NVCC__
-  	cuInit(0);
+  	hipInit(0);
 	  hipDevice_t device;
 	  hipCtx_t context;
 	  hipDeviceGet(&device, 0);

From 874740bd5592bfbaa9aff924398922159ef7f941 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 15:25:35 -0500
Subject: [PATCH 33/56] added sample to porting driver guide

Change-Id: Ia2b54311cec617e5e2a162fd31415c8caad4a374
---
 docs/markdown/hip_porting_driver_api.md | 80 +++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/docs/markdown/hip_porting_driver_api.md b/docs/markdown/hip_porting_driver_api.md
index 0fca5771b8..0566e452bd 100644
--- a/docs/markdown/hip_porting_driver_api.md
+++ b/docs/markdown/hip_porting_driver_api.md
@@ -171,3 +171,83 @@ hipFunction k;
 hipModuleGetFunction(&k, module, "myKernel");
 ```
 
+The below sample shows how to use `hipModuleGetFunction`.
+
+```
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__
+#define fileName "vcpy_isa.co"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#endif
+
+#define kernel_name "hello_world"
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+        std::cout<<A[i] << " "<<B[i]<<std::endl;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+          hipInit(0);
+          hipDevice_t device;
+          hipCtx_t context;
+          hipDeviceGet(&device, 0);
+          hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&con                                     fig);
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=0;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+          hipCtxDetach(context);
+#endif
+
+    return 0;
+}
+
+```

From 24e5fde444ff273fed2bdd6b176be3ea68d851cb Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 15:38:04 -0500
Subject: [PATCH 34/56] added docs to how to use hipModuleLaunchKernel

Change-Id: I626241552c69cdae56501371374ca8f2c6776c85
---
 docs/markdown/hip_porting_driver_api.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/markdown/hip_porting_driver_api.md b/docs/markdown/hip_porting_driver_api.md
index 0566e452bd..758b0672c1 100644
--- a/docs/markdown/hip_porting_driver_api.md
+++ b/docs/markdown/hip_porting_driver_api.md
@@ -115,6 +115,9 @@ addresses from a single unified pool.  Thus addresses may be shared between cont
 unlike the original CUDA definition a new context does not create a new address space for
 the device.   
 
+#### Using hipModuleLaunchKernel
+`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. The argument `kernelParams` is not fully implemented for HCC. The workaround for it is, to use platform specific macros for each target. Or, `extra` argument can be used which works on both the platforms.
+
 #### Additional Information
 HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
 

From 8fb076f5c449dbc9adc51888e07811b8b99dd871 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 29 Aug 2016 10:56:48 -0500
Subject: [PATCH 35/56] remove stray printf

Change-Id: Ie64778a83dfe684ffaab3c31bc3d09b713f825b9
---
 src/hip_memory.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index c72e92fb50..a64ad94e81 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -870,7 +870,6 @@ hipError_t hipMemGetInfo  (size_t *free, size_t *total)
             // TODO - replace with kernel-level for reporting free memory:
             size_t deviceMemSize, hostMemSize, userMemSize;
             hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
-            printf ("deviceMemSize=%zu\n", deviceMemSize);
 
             *free =  device->_props.totalGlobalMem - deviceMemSize;
         }

From 21e5c25225ba6f67b8778e376ad71a08cec5965f Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 29 Aug 2016 18:37:57 -0500
Subject: [PATCH 36/56] Refactor trace code for hipLaunchKernel.

- Use standard print functions for streams.
- Add HIP_INIT macro, for cases where we want to initialize HIP but not
  log an API (ihipPreKernelLaunch).

Change-Id: If43cf8a363d918bcd3722a2e6a965d4cfa2e03e7
---
 include/hcc_detail/hip_hcc.h     |  8 ++++++--
 include/hcc_detail/hip_runtime.h | 18 +++++++++---------
 src/hip_hcc.cpp                  |  9 ++++++++-
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h
index 449d9bc522..01c36afde4 100644
--- a/include/hcc_detail/hip_hcc.h
+++ b/include/hcc_detail/hip_hcc.h
@@ -160,13 +160,17 @@ class ihipCtx_t;
 #endif
 
 
+// Just initialize the HIP runtime, but don't log any trace information.
+#define HIP_INIT()\
+	std::call_once(hip_initialized, ihipInit);\
+    ihipCtxStackUpdate();
+
 
 // This macro should be called at the beginning of every HIP API.
 // It initialies the hip runtime (exactly once), and
 // generate trace string that can be output to stderr or to ATP file.
 #define HIP_INIT_API(...) \
-	std::call_once(hip_initialized, ihipInit);\
-    ihipCtxStackUpdate();\
+    HIP_INIT()\
     API_TRACE(__VA_ARGS__);
 
 #define ihipLogStatus(hipStatus) \
diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h
index 740e60d0eb..727604d8d8 100644
--- a/include/hcc_detail/hip_runtime.h
+++ b/include/hcc_detail/hip_runtime.h
@@ -622,11 +622,12 @@ __device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
 
 #ifdef __HCC_CPP__
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
-void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
+extern void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream);
+extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
 
 // TODO - move to common header file.
 #define KNRM  "\x1B[0m"
@@ -638,10 +639,9 @@ do {\
   grid_launch_parm lp;\
   lp.dynamic_group_mem_bytes = _groupMemBytes; \
   hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
-    if (HIP_TRACE_API) {\
-        fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
-                #_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.dynamic_group_mem_bytes, (void*)(_stream));\
-    }\
+  if (HIP_TRACE_API) {\
+      ihipPrintKernelLaunch(#_kernelName, &lp, _stream); \
+  }\
   _kernelName (lp, ##__VA_ARGS__);\
   ihipPostLaunchKernel(trueStream, lp);\
 } while(0)
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 09cf46f8a7..340d572bec 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -1314,13 +1314,20 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
     }
 }
 
+void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) 
+{
+    std::string streamString = ToString(stream);
+    fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d %s\n" KNRM, \
+            kernelName, lp->grid_dim.x, lp->grid_dim.y, lp->grid_dim.z, lp->group_dim.x, lp->group_dim.y, lp->group_dim.z, 
+            lp->dynamic_group_mem_bytes, streamString.c_str());\
+}
 
 // TODO - data-up to data-down:
 // Called just before a kernel is launched from hipLaunchKernel.
 // Allows runtime to track some information about the stream.
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
     stream = ihipSyncAndResolveStream(stream);
 #if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid.x;

From 91f0b1214d04ef976231e2d98f37915ef2744f85 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 29 Aug 2016 18:39:50 -0500
Subject: [PATCH 37/56] Update supported API list

Change-Id: Idbc941e4464d200aca7ac3382060edc08ab94ef2
---
 .../CUDA_Runtime_API_functions_supported_by_HIP.md     | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
index 189f172c95..2d589ec415 100644
--- a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
+++ b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
@@ -42,12 +42,12 @@
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
 | `cudaStreamAddCallback`                                   |                               | Add a callback to a compute stream.                                                                                            |
-| `cudaStreamAttachMemAsync`                                |                               | Attach memory to a stream asynchronously.                                                                                      |
+| `cudaStreamAttachMemAsync`                                |                               | Attach managed memory to a stream asynchronously.                                                                                      |
 | `cudaStreamCreate`                                        | `hipStreamCreate`             | Create an asynchronous stream.                                                                                                 |
 | `cudaStreamCreateWithFlags`                               | `hipStreamCreateWithFlags`    | Create an asynchronous stream.                                                                                                 |
 | `cudaStreamCreateWithPriority`                            |                               | Create an asynchronous stream with the specified priority.                                                                     |
 | `cudaStreamDestroy`                                       | `hipStreamDestroy`            | Destroys and cleans up an asynchronous stream.                                                                                 |
-| `cudaStreamGetFlags`                                      |                               | Query the flags of a stream.                                                                                                   |
+| `cudaStreamGetFlags`                                      | `hipStreamGetFlags`           | Query the flags of a stream.                                                                                                   |
 | `cudaStreamGetPriority`                                   |                               | Query the priority of a stream.                                                                                                |
 | `cudaStreamQuery`                                         |                               | Queries an asynchronous stream for completion status.                                                                          |
 | `cudaStreamSynchronize`                                   | `hipStreamSynchronize`        | Waits for stream tasks to complete.                                                                                            |
@@ -100,8 +100,8 @@
 | `cudaHostAlloc`                                           | `hipHostMalloc`                | Allocates page-locked memory on the host.                                                                                      |
 | `cudaHostGetDevicePointer`                                | `hipHostGetDevicePointer`    | Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister.                 |
 | `cudaHostGetFlags`                                        | `hipHostGetFlags`           | Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc.                                              |
-| `cudaHostRegister`                                        |                               | Registers an existing host memory range for use by CUDA.                                                                       |
-| `cudaHostUnregister`                                      |                               | Unregisters a memory range that was registered with cudaHostRegister.                                                          |
+| `cudaHostRegister`                                        | `hipHostRegister`             | Registers an existing host memory range for use by CUDA.                                                                       |
+| `cudaHostUnregister`                                      | `hipHostUnregister`           | Unregisters a memory range that was registered with cudaHostRegister.                                                          |
 | `cudaMalloc`                                              | `hipMalloc`                   | Allocate memory on the device.                                                                                                 |
 | `cudaMalloc3D`                                            |                               | Allocates logical 1D, 2D, or 3D memory objects on the device.                                                                  |
 | `cudaMalloc3DArray`                                       |                               | Allocate an array on the device.                                                                                               |
@@ -231,7 +231,7 @@
 
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-| `cudaBindSurfaceToArra`y                                  |                               | Binds an array to a surface.                                                                                                   |
+| `cudaBindSurfaceToArray`                                  |                               | Binds an array to a surface.                                                                                                   |
 | `cudaBindTexture`                                         |                               | Binds a memory area to a texture.                                                                                              |
 | `cudaBindTexture2D`                                       |                               | Binds a 2D memory area to a texture.                                                                                           |
 | `cudaBindTextureToArray`                                  |                               | Binds an array to a texture.                                                                                                   |

From 02c3e75b29e5c6c7d91cf6ec4b5a759d2eba10f4 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 29 Aug 2016 18:40:23 -0500
Subject: [PATCH 38/56] Add guideline for HIP_INIT_API

Change-Id: Ib8fa1e952f31158e9435dfc37ff23db9fda6fd7e
---
 CONTRIBUTING.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index cd785fe336..060e70e519 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -117,6 +117,10 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl
     - Keyword TODO refers to a note that should be addressed in long-term.  Could be style issue, software architecture, or known bugs.
     - FIXME refers to a short-term bug that needs to be addressed.
 
+- HIP_INIT_API() should be placed at the start of each top-level HIP API.  This function will make sure the HIP runtime is initialized,
+  and also constructs an appropriate API string for tracing and CodeXL marker tracing.  The arguments to HIP_INIT_API should match
+  those of the parent fucntion.  
+
 
 #### Presubmit Testing:
 Before checking in or submitting a pull request, run all Rodinia tests and ensure pass results match starting point:

From f6f92a6528857819a3027a030c870012c62dc54c Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 21:24:19 -0500
Subject: [PATCH 39/56] Added doxygen comments for module apis

Change-Id: I1825249bf91efe7d058f9026a82ec47855759c98
---
 include/hcc_detail/hip_runtime_api.h | 65 +++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 0b3be62c3b..adf72d3196 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1141,17 +1141,80 @@ hipError_t hipDeviceGetFromId(hipDevice_t *device, int deviceId);
  */
 hipError_t hipDriverGetVersion(int *driverVersion) ;
 
-
+/**
+ * @brief Loads code object from file into a hipModule_t
+ *
+ * @param [in] fname
+ * @param [out] module
+ * 
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound, hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized
+ *
+ * 
+ */
 hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
 
+/**
+ * @brief Freeing the module
+ *
+ * @param [in] module
+ *
+ * @returns hipSuccess, hipInvalidValue
+ * module is freed and the code objects associated with it are destroyed
+ * 
+ */
+
 hipError_t hipModuleUnload(hipModule_t module);
 
+/**
+ * @brief Function with kname will be extracted present in module
+ *
+ * @param [in] module
+ * @param [in] kname
+ * @param [out] function
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized, hipErrorNotFound, 
+ */
 hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
 
+/**
+ * @brief returns device memory pointer and size of the kernel present in the module with symbol - name
+ *
+ * @param [in] moodule
+ * @param [in] name
+ * @param [out] dptr
+ * @param [out[ bytes
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
+ */
 hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
 
+/**
+ * @brief builds module from code object which resides in host memory. And image is pointer to that location.
+ *
+ * @param [in] image
+ * @param [out] module
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized
+ */
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 
+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
+ *
+ * @param [in[ f
+ * @param [in] gridDimX
+ * @param [in] gridDimY
+ * @param [in] gridDimZ
+ * @param [in] blockDimX
+ * @param [in] blockDimY
+ * @param [in] blockDimZ
+ * @param [in] sharedMemBytes
+ * @param [in] stream
+ * @param [in] kernelParams
+ * @param [in] extraa
+ *
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ */
 hipError_t hipModuleLaunchKernel(hipFunction_t f,
                               unsigned int gridDimX,
                               unsigned int gridDimY,

From 5178bd6308fdae57dc1fc1fb191a53acfb6bdcc5 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 21:36:34 -0500
Subject: [PATCH 40/56] Added HIP_INIT_API for all context apis

Change-Id: I3dd8c068236d9572659a58683fdb59d2b9dccaf3
---
 src/hip_context.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index 1caa3b4e42..f1d9ff8387 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -101,6 +101,7 @@ hipError_t hipDriverGetVersion(int *driverVersion)
 
 hipError_t hipCtxDestroy(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     ihipCtx_t* currentCtx= ihipGetTlsDefaultCtx();
     ihipCtx_t* primaryCtx= ((ihipDevice_t*)ctx->getDevice())->_primaryCtx;
@@ -122,6 +123,7 @@ hipError_t hipCtxDestroy(hipCtx_t ctx)
 
 hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     ihipCtx_t* tempCtx;
     *ctx = ihipGetTlsDefaultCtx();
@@ -141,6 +143,7 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
 
 hipError_t hipCtxPushCurrent(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     if(ctx != NULL) {    //TODO- is this check needed?
         ihipSetTlsDefaultCtx(ctx);
@@ -154,6 +157,7 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx)
 
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
 #if 0
     *ctx = ihipGetTlsDefaultCtx();
@@ -172,6 +176,7 @@ hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
 
 hipError_t hipCtxSetCurrent(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
     if(ctx == NULL) {
         tls_ctxStack.pop();
@@ -185,6 +190,7 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx)
 
 hipError_t hipCtxGetDevice(hipDevice_t *device)
 {
+    HIP_INIT_API(device);
     hipError_t e = hipSuccess;
 
     ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
@@ -247,11 +253,13 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
 
 hipError_t hipCtxSynchronize ( void )
 {
+    HIP_INIT_API(1);
     return ihipSynchronize(); //TODP Shall check validity of ctx?
 }
 
 hipError_t hipCtxGetFlags ( unsigned int* flags )
 {
+    HIP_INIT_API(flags);
     hipError_t e = hipSuccess;
     ihipCtx_t* tempCtx;
     tempCtx = ihipGetTlsDefaultCtx();

From 1769c4b4b2e1e15cd8f377c02471e42635484323 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Mon, 29 Aug 2016 21:42:22 -0500
Subject: [PATCH 41/56] remove HIP_INIT_API from ihipSynchronize

Change-Id: Ibe0739efe55573c023212d9c28ba847c777e434c
---
 src/hip_hcc.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 340d572bec..ea8604b7c4 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -160,8 +160,6 @@ ihipCtx_t *ihipGetTlsDefaultCtx()
 
 hipError_t ihipSynchronize(void)
 {
-    HIP_INIT_API();
-
     ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
 
     return ihipLogStatus(hipSuccess);

From 310f0660aa135b2bc7dcfe79695dc3a247255f02 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Tue, 30 Aug 2016 15:55:51 +0530
Subject: [PATCH 42/56] CUresult to hipError mapping updates

Change-Id: I602a70acda798a47ebbbef84b438b21c399821c3
---
 include/nvcc_detail/hip_runtime_api.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h
index 9ca3542569..baeb080195 100644
--- a/include/nvcc_detail/hip_runtime_api.h
+++ b/include/nvcc_detail/hip_runtime_api.h
@@ -103,8 +103,10 @@ switch(cuError) {
     case CUDA_ERROR_OUT_OF_MEMORY                : return hipErrorMemoryAllocation            ;
     case CUDA_ERROR_INVALID_VALUE                : return hipErrorInvalidValue                ;
     case CUDA_ERROR_INVALID_DEVICE               : return hipErrorInvalidDevice               ;
-    case CUDA_ERROR_DEINITIALIZED            : return hipErrorInitializationError         ;
-    case CUDA_ERROR_NO_DEVICE                  : return hipErrorNoDevice                    ;
+    case CUDA_ERROR_DEINITIALIZED                : return hipErrorDeinitialized               ;
+    case CUDA_ERROR_NO_DEVICE                    : return hipErrorNoDevice                    ;
+    case CUDA_ERROR_INVALID_CONTEXT              : return hipErrorInvalidContext              ;
+    case CUDA_ERROR_NOT_INITIALIZED              : return hipErrorNotInitialized              ;
     default                                      : return hipErrorUnknown;  // Note - translated error.
 };
 }

From 69fb9ee3aa00c05bcf4cafee246a1cab8fce2e46 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 31 Aug 2016 10:18:46 -0500
Subject: [PATCH 43/56] added doxygen comments for hipModuleLaunchKernel

Change-Id: I8a52d2e62f4b7eea8e05d779b9fda49f0ac45130
---
 include/hcc_detail/hip_runtime_api.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index adf72d3196..0114a0772b 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1213,7 +1213,10 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
  * @param [in] kernelParams
  * @param [in] extraa
  *
+ * The function takes the above arguments and run the kernel in hipFunction_t f.  with launch parameters specified in gridDimX, gridDimY, gridDimZ,  blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
  * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
  */
 hipError_t hipModuleLaunchKernel(hipFunction_t f,
                               unsigned int gridDimX,

From 4a0c6c2d53464931e210ab6ead0e4171a43aaab3 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 31 Aug 2016 13:05:57 -0500
Subject: [PATCH 44/56] added kernel only compilation feature for hipcc 1.
 Added hipgenisa.sh file to compile kernel code to hsa code object 2. Changed
 hipcc to call hipgenisa.sh, making hipcc compiling the kernels

Change-Id: I976459c1ebb24343e1b1fe38b4c3a203f1adffa9
---
 bin/hipcc        | 11 +++++++++++
 bin/hipgenisa.sh | 26 ++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100755 bin/hipgenisa.sh

diff --git a/bin/hipcc b/bin/hipcc
index e91993b460..b92de9e29d 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -157,6 +157,16 @@ if ($HIP_PLATFORM eq "hcc") {
     exit (-1);
 }
 
+my $ISACMD="hipgenisa.sh ";
+$ISACMD .= $ROCM_PATH;
+if($ARGV[0] eq "--genisa"){
+  foreach $isaarg (@ARGV){
+    $ISACMD .= " ";
+    $ISACMD .= $isaarg;
+  }
+  system($ISACMD);
+  exit(-1);
+}
 
 # Add paths to common HIP includes:
 $HIPCXXFLAGS .= " -I$HIP_PATH/include" ;
@@ -178,6 +188,7 @@ if ($verbose & 0x4) {
 }
 
 my $toolArgs = "";  # arguments to pass to the hcc or nvcc tool
+
 foreach $arg (@ARGV)
 {
     my $swallowArg = 0;
diff --git a/bin/hipgenisa.sh b/bin/hipgenisa.sh
new file mode 100755
index 0000000000..ef5417a428
--- /dev/null
+++ b/bin/hipgenisa.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+if [ $1 = " " ]
+then
+exit
+fi
+
+ROCM_PATH=$1
+GEN_ISA=$3
+FILE_NAMES=$4
+OUT=$5
+OUTPUT_FILE=$6
+TARGET=""
+if [ ${GEN_ISA:0:12} = "--target-isa" ]
+then
+  TARGET=${GEN_ISA:13:12}
+fi
+
+export KMDUMPISA=1
+export KMDUMPLLVM=1
+hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
+mv dump.* /tmp/hipgenisa/
+${ROCM_PATH}/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
+${ROCM_PATH}/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
+export KMDUMPISA=0
+export KMDUMPLLVM=0

From 569e0b2eed5d8fe3c026c34d99d5f21b101738d9 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 31 Aug 2016 13:22:28 -0500
Subject: [PATCH 45/56] added how to use hipcc for kernel compilation

Change-Id: If652316272f21b90516f5a5ed88c17f4f4e77fb0
---
 docs/markdown/hip_kernel_language.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md
index e74a010a60..ba52b88433 100644
--- a/docs/markdown/hip_kernel_language.md
+++ b/docs/markdown/hip_kernel_language.md
@@ -667,3 +667,22 @@ The following C++ features are not supported:
 - Run-time-type information (RTTI)
 - Virtual functions
 - Try/catch
+
+## Kernel Compilation
+HIP now supports compiling C++/HIP kernels to binary. Eventhough HIP does not support fatbinary (yet), the user can specify the target for which the binary can be generated. The file format for binary is `.co` which means Code Object. The following command builds the binary using `hipcc`.
+
+`hipcc --genisa --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE]`
+```[TARGET GPU] = fiji/hawaii
+[INPUT FILE] = Name of the file containing kernels
+[OUTPUT FILE] = Name of the generated code object file```
+
+Note that the kernel file should have `int main(){}` at the end it so that the binary is generated. This happens because HCC generates binaries at linking time rather than compilation
+
+You need 3 things to run kernel in binary.
+1. Kernel Binary
+2. Name of kernel binary
+3. Name of the kernel
+
+We already got first two of them. In order to get name of the kernel, try `objdump -x [OUTPUT FILE]`. OUTPUT FILE is file generated by hipcc during kernel compilation. The output from objdump has symbol to the kernel whose name is mangled with `grid_launch_parm`, `__functor`, `__cxxamp_trampoline`. An example of how it looks is `ZN12_GLOBAL__N_137_Z3Cpy16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf` where `Cpy` is the name of the kernel written in C++.
+
+

From d5a6e22c59f9ebfcc72cfc20b6534420823ab64c Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 31 Aug 2016 13:56:07 -0500
Subject: [PATCH 46/56] added sample for how-to-use pre-compiled kernels1.
 Corrected the exit output of kernel compilation by hipcc 2. Added sample
 which loads/run kernel binary during runtime?

Change-Id: I26ccaca1f844fee317592e26c9e654ce548b96a8
---
 bin/hipcc                                |   2 +-
 samples/0_Intro/module_api/Makefile      |  19 ++++
 samples/0_Intro/module_api/runKernel.cpp | 105 +++++++++++++++++++++++
 samples/0_Intro/module_api/vcpy_isa.cpp  |   9 ++
 samples/0_Intro/module_api/vcpy_isa.cu   |   6 ++
 samples/0_Intro/module_api/vcpy_isa.ptx  |  38 ++++++++
 6 files changed, 178 insertions(+), 1 deletion(-)
 create mode 100644 samples/0_Intro/module_api/Makefile
 create mode 100644 samples/0_Intro/module_api/runKernel.cpp
 create mode 100644 samples/0_Intro/module_api/vcpy_isa.cpp
 create mode 100644 samples/0_Intro/module_api/vcpy_isa.cu
 create mode 100644 samples/0_Intro/module_api/vcpy_isa.ptx

diff --git a/bin/hipcc b/bin/hipcc
index b92de9e29d..b491f34665 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -165,7 +165,7 @@ if($ARGV[0] eq "--genisa"){
     $ISACMD .= $isaarg;
   }
   system($ISACMD);
-  exit(-1);
+  exit(0);
 }
 
 # Add paths to common HIP includes:
diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile
new file mode 100644
index 0000000000..b58882b0fd
--- /dev/null
+++ b/samples/0_Intro/module_api/Makefile
@@ -0,0 +1,19 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+all: vcpy_isa.compile runKernel.hip.out
+
+runKernel.cuda.out: runKernel.cpp
+	nvcc runKernel.cpp -o $@
+
+vcpy_isa.compile: vcpy_isa.cpp
+	$(HIPCC) --genisa --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co	
+
+runKernel.hip.out: runKernel.cpp
+	$(HIPCC) runKernel.cpp -o runKernel.hip.out
+
+clean:
+	rm -f *.co *.out
diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp
new file mode 100644
index 0000000000..e4fa1b6d93
--- /dev/null
+++ b/samples/0_Intro/module_api/runKernel.cpp
@@ -0,0 +1,105 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__ 
+#define fileName "vcpy_isa.co"
+#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#define kernel_name "hello_world"
+#endif
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+  	hipInit(0);
+	  hipDevice_t device;
+	  hipCtx_t context;
+	  hipDeviceGet(&device, 0);
+	  hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+  uint32_t len = LEN;
+  uint32_t one = 1;
+
+    std::vector<void*>argBuffer(5);
+    uint32_t *ptr32_t = (uint32_t*)&argBuffer[0];
+    memcpy(ptr32_t + 0, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 1, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 2, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 3, &len, sizeof(uint32_t));
+    memcpy(ptr32_t + 4, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 5, &one, sizeof(uint32_t));
+    memcpy(&argBuffer[3], &Ad, sizeof(void*));
+    memcpy(&argBuffer[4], &Bd, sizeof(void*));
+
+
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); 
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=LEN-4;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+	  hipCtxDetach(context);
+#endif 
+
+    return 0;
+}
diff --git a/samples/0_Intro/module_api/vcpy_isa.cpp b/samples/0_Intro/module_api/vcpy_isa.cpp
new file mode 100644
index 0000000000..ead3253115
--- /dev/null
+++ b/samples/0_Intro/module_api/vcpy_isa.cpp
@@ -0,0 +1,9 @@
+#include<hip_runtime.h>
+
+__global__ void hello_world(hipLaunchParm lp, float *a, float *b)
+{
+    int tx = hipThreadIdx_x;
+    b[tx] = a[tx];
+}
+
+int main(){}
diff --git a/samples/0_Intro/module_api/vcpy_isa.cu b/samples/0_Intro/module_api/vcpy_isa.cu
new file mode 100644
index 0000000000..d2a0838604
--- /dev/null
+++ b/samples/0_Intro/module_api/vcpy_isa.cu
@@ -0,0 +1,6 @@
+
+extern "C" __global__ void hello_world(float *a, float *b)
+{
+	int tx = threadIdx.x;
+	b[tx] = a[tx];
+}
diff --git a/samples/0_Intro/module_api/vcpy_isa.ptx b/samples/0_Intro/module_api/vcpy_isa.ptx
new file mode 100644
index 0000000000..62eb3f63df
--- /dev/null
+++ b/samples/0_Intro/module_api/vcpy_isa.ptx
@@ -0,0 +1,38 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
+// Based on LLVM 3.4svn
+//
+
+.version 4.3
+.target sm_20
+.address_size 64
+
+	// .globl	hello_world
+
+.visible .entry hello_world(
+	.param .u64 hello_world_param_0,
+	.param .u64 hello_world_param_1
+)
+{
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<2>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [hello_world_param_0];
+	ld.param.u64 	%rd2, [hello_world_param_1];
+	cvta.to.global.u64 	%rd3, %rd2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mov.u32 	%r1, %tid.x;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f32 	%f1, [%rd6];
+	add.s64 	%rd7, %rd3, %rd5;
+	st.global.f32 	[%rd7], %f1;
+	ret;
+}
+
+

From c0cfdf2ee98a7bb6df3c6726fdd5a4d1ff704791 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 31 Aug 2016 20:16:48 -0500
Subject: [PATCH 47/56] corrected path to hipgenisa.sh and fixed its working

Change-Id: I140055d5e800cd4b23253171ba889db5e63b637b
---
 bin/hipcc        | 3 ++-
 bin/hipgenisa.sh | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/bin/hipcc b/bin/hipcc
index b491f34665..946f5abe68 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -157,7 +157,8 @@ if ($HIP_PLATFORM eq "hcc") {
     exit (-1);
 }
 
-my $ISACMD="hipgenisa.sh ";
+my $ISACMD=$HIP_PATH;
+$ISACMD .= "/bin/hipgenisa.sh ";
 $ISACMD .= $ROCM_PATH;
 if($ARGV[0] eq "--genisa"){
   foreach $isaarg (@ARGV){
diff --git a/bin/hipgenisa.sh b/bin/hipgenisa.sh
index ef5417a428..496e79e34c 100755
--- a/bin/hipgenisa.sh
+++ b/bin/hipgenisa.sh
@@ -18,9 +18,11 @@ fi
 
 export KMDUMPISA=1
 export KMDUMPLLVM=1
+mkdir /tmp/hipgenisa
 hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
 mv dump.* /tmp/hipgenisa/
 ${ROCM_PATH}/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
 ${ROCM_PATH}/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
+rm -r /tmp/hipgenisa
 export KMDUMPISA=0
 export KMDUMPLLVM=0

From 6a6a0ae32f2d9d194c20ea5de75e59d4195634c2 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 31 Aug 2016 20:36:36 -0500
Subject: [PATCH 48/56] added device code offline compilation for nvcc

Change-Id: I1f77131778b4dcfcf720b4367dd9f18ffaf1cbb3
---
 bin/hipcc | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/bin/hipcc b/bin/hipcc
index 946f5abe68..3a9275de51 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -157,16 +157,30 @@ if ($HIP_PLATFORM eq "hcc") {
     exit (-1);
 }
 
+
 my $ISACMD=$HIP_PATH;
-$ISACMD .= "/bin/hipgenisa.sh ";
-$ISACMD .= $ROCM_PATH;
-if($ARGV[0] eq "--genisa"){
-  foreach $isaarg (@ARGV){
-    $ISACMD .= " ";
-    $ISACMD .= $isaarg;
+if($HIP_PLATFORM eq "hcc"){
+  $ISACMD .= "/bin/hipgenisa.sh ";
+  $ISACMD .= $ROCM_PATH;
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV){
+      $ISACMD .= " ";
+      $ISACMD .= $isaarg;
+    }
+    system($ISACMD);
+    exit(0);
+  }
+}
+
+if($HIP_PLATFORM eq "nvcc"){
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV){
+      $ISACMD .= " ";
+      $ISACMD .= $isarg;
+    }
+    system($ISACMD);
+    exit(0);
   }
-  system($ISACMD);
-  exit(0);
 }
 
 # Add paths to common HIP includes:

From deb2638397b155911ccfc110fbafd211e877377d Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Thu, 1 Sep 2016 11:00:30 +0530
Subject: [PATCH 49/56] Added HIP_INIT_API to hipDeviceSynchronize()

Change-Id: I9bd6bf206905621b17e1999994b5ea09b7382180
---
 src/hip_device.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index d31e3b5890..14338c11de 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -160,6 +160,7 @@ hipError_t hipSetDevice(int deviceId)
  */
 hipError_t hipDeviceSynchronize(void)
 {
+    HIP_INIT_API(1);
     return ihipSynchronize();
 }
 

From dd84bb0d849ce7e1940abd04c30d67f8c4502d75 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Thu, 1 Sep 2016 11:35:42 +0530
Subject: [PATCH 50/56] Added context related difference note in
 hip_porting_driver_api.md

Change-Id: I5bd2884a16db51871baa7c19fa2bd63a0bd3adad
---
 docs/markdown/hip_porting_driver_api.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/markdown/hip_porting_driver_api.md b/docs/markdown/hip_porting_driver_api.md
index 758b0672c1..b0ac3ecf1d 100644
--- a/docs/markdown/hip_porting_driver_api.md
+++ b/docs/markdown/hip_porting_driver_api.md
@@ -119,7 +119,8 @@ the device.
 `hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. The argument `kernelParams` is not fully implemented for HCC. The workaround for it is, to use platform specific macros for each target. Or, `extra` argument can be used which works on both the platforms.
 
 #### Additional Information
-HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
+- HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
+- HCC creates a primary context when the HIP API is called.  So in a pure driver API code, HIP/HCC will create a primary context while HIP/NVCC will have empty context stack.  HIP/HCC will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
 
 ### NVCC Implementation Notes
 

From 13d172143147e1a6431de064b5c647c5b1144295 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Thu, 1 Sep 2016 12:10:31 +0530
Subject: [PATCH 51/56] module_api/Makefile: Use gencodeobject instead of
 genisa

Change-Id: I7e3523810f5603ad727b1fda7ff2d0dc53ec72d7
---
 samples/0_Intro/module_api/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile
index b58882b0fd..582899b6df 100644
--- a/samples/0_Intro/module_api/Makefile
+++ b/samples/0_Intro/module_api/Makefile
@@ -10,7 +10,7 @@ runKernel.cuda.out: runKernel.cpp
 	nvcc runKernel.cpp -o $@
 
 vcpy_isa.compile: vcpy_isa.cpp
-	$(HIPCC) --genisa --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co	
+	$(HIPCC) --gencodeobject --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co
 
 runKernel.hip.out: runKernel.cpp
 	$(HIPCC) runKernel.cpp -o runKernel.hip.out

From 2d1d6b2973e1e91e5e8909d59394a09574f5b3c3 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Thu, 1 Sep 2016 12:11:11 +0530
Subject: [PATCH 52/56] Fix typo in hipcc

Change-Id: I35817c08b503f438f412d4bfe84afc11b671dc77
---
 bin/hipcc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/hipcc b/bin/hipcc
index 3a9275de51..9429fc611a 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -176,7 +176,7 @@ if($HIP_PLATFORM eq "nvcc"){
   if($ARGV[0] eq "--gencodeobject"){
     foreach $isaarg (@ARGV){
       $ISACMD .= " ";
-      $ISACMD .= $isarg;
+      $ISACMD .= $isaarg;
     }
     system($ISACMD);
     exit(0);

From 5456668776d2097ead89c81bec6d7be36cf6a7d0 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Thu, 1 Sep 2016 12:13:11 +0530
Subject: [PATCH 53/56] hipgenisa.sh looks for hipcc relative to itself

Change-Id: Iac63fb5a87db1d735dbfa9697ce1f769a36b4d4a
---
 bin/hipgenisa.sh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/bin/hipgenisa.sh b/bin/hipgenisa.sh
index 496e79e34c..5ccee14a8f 100755
--- a/bin/hipgenisa.sh
+++ b/bin/hipgenisa.sh
@@ -16,13 +16,16 @@ then
   TARGET=${GEN_ISA:13:12}
 fi
 
+SOURCE="${BASH_SOURCE[0]}"
+HIP_PATH="$( command cd -P "$( dirname "$SOURCE" )/.." && pwd )"
+
 export KMDUMPISA=1
 export KMDUMPLLVM=1
 mkdir /tmp/hipgenisa
-hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
+$HIP_PATH/bin/hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
 mv dump.* /tmp/hipgenisa/
-${ROCM_PATH}/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
-${ROCM_PATH}/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
+$ROCM_PATH/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
+$ROCM_PATH/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
 rm -r /tmp/hipgenisa
 export KMDUMPISA=0
 export KMDUMPLLVM=0

From 94f035d80db09122751cc5ffd19bf0df5796f1f8 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Thu, 1 Sep 2016 12:19:41 +0530
Subject: [PATCH 54/56] Added doxygen documentation for hipCtxXXX APIs

Change-Id: Id9cf3491e9a8c78cb7bfeb9976c36d6fe565628f
---
 include/hcc_detail/hip_runtime_api.h | 111 ++++++++++++++++++++++++++-
 src/hip_context.cpp                  |  72 +++++++++++++++--
 2 files changed, 173 insertions(+), 10 deletions(-)

diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h
index 0114a0772b..fbcca3d12f 100644
--- a/include/hcc_detail/hip_runtime_api.h
+++ b/include/hcc_detail/hip_runtime_api.h
@@ -1062,34 +1062,137 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
 hipError_t hipInit(unsigned int flags) ;
 
 
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Context Management
+ *  @{
+ */
 
-// TODO-ctx
+/**
+ * @brief Create a context and set it as current/ default context
+ *
+ * @param [out] ctx
+ * @param [in] flags
+ * @param [in] associated device handle
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
 hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device);
 
 hipError_t hipCtxDestroy(hipCtx_t ctx);
 
+/**
+ * @brief Pop the current/default context and return the popped context.
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
 
+/**
+ * @brief Push the context to be set as current/ default context
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
+
 hipError_t hipCtxPushCurrent(hipCtx_t ctx);
 
+/**
+ * @brief Set the passed context as current/default
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxSetCurrent(hipCtx_t ctx);
 
+/**
+ * @brief Get the handle of the current/ default context
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
 
+/**
+ * @brief Get the handle of the device associated with current/default context
+ *
+ * @param [out] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
+
 hipError_t hipCtxGetDevice(hipDevice_t *device);
 
+/**
+ * @brief Returns the approximate HIP api version.
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
+ * This function always set *apiVersion to 4 as an approximation though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP apps code should not rely on the api revision number here and should
+ * use arch feature flags to test device capabilities or conditional compilation.
+ *
+ */
 hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion);
 
+/**
+ * @brief Set Cache configuration for a specific function
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig );
 
+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig );
 
+/**
+ * @brief Set Shared memory bank configuration.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config );
 
+/**
+ * @brief Get Shared memory bank configuration.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig );
 
+/**
+ * @brief Blocks until the default context has completed all preceding requested tasks.
+ *
+ * This function waits for all streams on the default context to complete execution, and then returns.
+ *
+ * @returns #hipSuccess.
+*/
 hipError_t hipCtxSynchronize ( void );
 
+/**
+ * @brief Get flags used for creating current/default context.
+ *
+ * @param [out] flags
+ *
+ * @returns #hipSuccess.
+*/
+
 hipError_t hipCtxGetFlags ( unsigned int* flags );
 
 /**
@@ -1110,7 +1213,7 @@ hipError_t hipCtxGetFlags ( unsigned int* flags );
 hipError_t  hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags);
 
 /**
- * @brief Disable direct access from current device's virtual address space to memory allocations physically located on a peer device through contex.Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
+ * @brief Disable direct access from current context's virtual address space to memory allocations physically located on a peer context.Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
  *
  * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
  *
@@ -1120,6 +1223,10 @@ hipError_t  hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags);
  * @warning PeerToPeer support is experimental.
  */
 hipError_t  hipCtxDisablePeerAccess (hipCtx_t peerCtx);
+// doxygen end Context Management
+/**
+ * @}
+ */
 
 
 // TODO-ctx
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index f1d9ff8387..e19c45d2c3 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -41,6 +41,10 @@ hipError_t ihipCtxStackUpdate()
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+//---
 hipError_t hipInit(unsigned int flags)
 {
     HIP_INIT_API(flags);
@@ -55,7 +59,10 @@ hipError_t hipInit(unsigned int flags)
     return ihipLogStatus(e);
 }
 
-
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
 {
     HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init
@@ -68,7 +75,10 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
     return ihipLogStatus(e);
 }
 
-
+/**
+ * @return #hipSuccess, #hipErrorInvalidDevice
+ */
+//---
 hipError_t hipDeviceGet(hipDevice_t *device, int deviceId)
 {
     HIP_INIT_API(device, deviceId); // FIXME - review if we want to init
@@ -99,6 +109,10 @@ hipError_t hipDriverGetVersion(int *driverVersion)
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+//---
 hipError_t hipCtxDestroy(hipCtx_t ctx)
 {
     HIP_INIT_API(ctx);
@@ -121,6 +135,10 @@ hipError_t hipCtxDestroy(hipCtx_t ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
 {
     HIP_INIT_API(ctx);
@@ -141,6 +159,10 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess, #hipErrorInvalidContext
+ */
+//---
 hipError_t hipCtxPushCurrent(hipCtx_t ctx)
 {
     HIP_INIT_API(ctx);
@@ -155,16 +177,14 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
 {
     HIP_INIT_API(ctx);
     hipError_t e = hipSuccess;
-#if 0
-    *ctx = ihipGetTlsDefaultCtx();
-    if(*ctx == nullptr) {
-        *ctx = NULL;    //TODO - is it required? Can return nullptr?
-    }
-#endif
     if(!tls_ctxStack.empty()) {
         *ctx= tls_ctxStack.top();
     }
@@ -174,6 +194,10 @@ hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetCurrent(hipCtx_t ctx)
 {
     HIP_INIT_API(ctx);
@@ -188,6 +212,10 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess, #hipErrorInvalidContext
+ */
+//---
 hipError_t hipCtxGetDevice(hipDevice_t *device)
 {
     HIP_INIT_API(device);
@@ -204,6 +232,10 @@ hipError_t hipCtxGetDevice(hipDevice_t *device)
     return ihipLogStatus(e);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
 {
     HIP_INIT_API(apiVersion);
@@ -215,6 +247,10 @@ hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
 {
     HIP_INIT_API(cacheConfig);
@@ -224,6 +260,10 @@ hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
 {
     HIP_INIT_API(cacheConfig);
@@ -233,6 +273,10 @@ hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
 {
     HIP_INIT_API(config);
@@ -242,6 +286,10 @@ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
 {
     HIP_INIT_API(pConfig);
@@ -251,12 +299,20 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
     return ihipLogStatus(hipSuccess);
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSynchronize ( void )
 {
     HIP_INIT_API(1);
     return ihipSynchronize(); //TODP Shall check validity of ctx?
 }
 
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetFlags ( unsigned int* flags )
 {
     HIP_INIT_API(flags);

From 14b0fc80c7372a7a74766d3de8a8b3ba2572d66f Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Thu, 1 Sep 2016 15:06:36 +0530
Subject: [PATCH 55/56] Updates to gencodeobject generation logic

    - hipcc honors HIPCC_VERBOSE for gencodeobject too
    - fixed gencodeobject generation logic for nvcc path
    - hipgenisa.sh is no longer passed the --gencodeobject flag

Change-Id: Iab83c18a6decef445fda8468ba5de10cd7935046
---
 bin/hipcc        | 59 +++++++++++++++++++++++++++---------------------
 bin/hipgenisa.sh |  8 +++----
 2 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/bin/hipcc b/bin/hipcc
index 9429fc611a..8fde7f426d 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -157,32 +157,6 @@ if ($HIP_PLATFORM eq "hcc") {
     exit (-1);
 }
 
-
-my $ISACMD=$HIP_PATH;
-if($HIP_PLATFORM eq "hcc"){
-  $ISACMD .= "/bin/hipgenisa.sh ";
-  $ISACMD .= $ROCM_PATH;
-  if($ARGV[0] eq "--gencodeobject"){
-    foreach $isaarg (@ARGV){
-      $ISACMD .= " ";
-      $ISACMD .= $isaarg;
-    }
-    system($ISACMD);
-    exit(0);
-  }
-}
-
-if($HIP_PLATFORM eq "nvcc"){
-  if($ARGV[0] eq "--gencodeobject"){
-    foreach $isaarg (@ARGV){
-      $ISACMD .= " ";
-      $ISACMD .= $isaarg;
-    }
-    system($ISACMD);
-    exit(0);
-  }
-}
-
 # Add paths to common HIP includes:
 $HIPCXXFLAGS .= " -I$HIP_PATH/include" ;
 
@@ -202,6 +176,39 @@ if ($verbose & 0x4) {
     print "hipcc-args: ", join (" ", @ARGV), "\n";
 }
 
+# Handle code object generation
+my $ISACMD="";
+if($HIP_PLATFORM eq "hcc"){
+  $ISACMD .= "$HIP_PATH/bin/hipgenisa.sh ";
+  $ISACMD .= $ROCM_PATH;
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV[1..$#ARGV]){
+      $ISACMD .= " ";
+      $ISACMD .= $isaarg;
+    }
+    if ($verbose & 0x1) {
+      print "hipcc-cmd: ", $ISACMD, "\n";
+    }
+    system($ISACMD) and die();
+    exit(0);
+  }
+}
+
+if($HIP_PLATFORM eq "nvcc"){
+  $ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV[1..$#ARGV]){
+      $ISACMD .= " ";
+      $ISACMD .= $isaarg;
+    }
+    if ($verbose & 0x1) {
+      print "hipcc-cmd: ", $ISACMD, "\n";
+    }
+    system($ISACMD) and die();
+    exit(0);
+  }
+}
+
 my $toolArgs = "";  # arguments to pass to the hcc or nvcc tool
 
 foreach $arg (@ARGV)
diff --git a/bin/hipgenisa.sh b/bin/hipgenisa.sh
index 5ccee14a8f..e60abbf78f 100755
--- a/bin/hipgenisa.sh
+++ b/bin/hipgenisa.sh
@@ -6,10 +6,10 @@ exit
 fi
 
 ROCM_PATH=$1
-GEN_ISA=$3
-FILE_NAMES=$4
-OUT=$5
-OUTPUT_FILE=$6
+GEN_ISA=$2
+FILE_NAMES=$3
+OUT=$4
+OUTPUT_FILE=$5
 TARGET=""
 if [ ${GEN_ISA:0:12} = "--target-isa" ]
 then

From 811df250e64c5e4155fa1464883f465735945657 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Thu, 1 Sep 2016 15:11:12 +0530
Subject: [PATCH 56/56] Fixed module_api/Makefile to set flags based on
 HIP_PLATFORM

Change-Id: I2fa9a556e0c4f25f4963ecef1d25eb922f9af1b9
---
 samples/0_Intro/module_api/Makefile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile
index 582899b6df..25ae7b8411 100644
--- a/samples/0_Intro/module_api/Makefile
+++ b/samples/0_Intro/module_api/Makefile
@@ -3,14 +3,16 @@ ifeq (,$(HIP_PATH))
 	HIP_PATH=../../..
 endif
 HIPCC=$(HIP_PATH)/bin/hipcc
+HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
+
+ifeq (${HIP_PLATFORM}, hcc)
+	GENCODEOBJECT_FLAGS=--target-isa-fiji
+endif
 
 all: vcpy_isa.compile runKernel.hip.out
 
-runKernel.cuda.out: runKernel.cpp
-	nvcc runKernel.cpp -o $@
-
 vcpy_isa.compile: vcpy_isa.cpp
-	$(HIPCC) --gencodeobject --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co
+	$(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co
 
 runKernel.hip.out: runKernel.cpp
 	$(HIPCC) runKernel.cpp -o runKernel.hip.out